Unpublished Preprints#

We use OpenAlex to retrieve all articles that are preprints and have not yet been published in a peer-reviewed journal.

Load libraries#

from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from itertools import chain
import pandas as pd
import pyalex

Define helper functions#

# Check whether at least one of the paper's locations (journals) holds the published version
def is_any_location_published(locations):
    """Return True if any location's 'version' equals 'publishedVersion', else False."""
    return any(entry['version'] == 'publishedVersion' for entry in locations)
# Build a single comma-separated string from the display names of all authors
def join_authors(list_of_authors):
    """Return every entry's ['author']['display_name'], joined by ', '."""
    names = (entry['author']['display_name'] for entry in list_of_authors)
    return ', '.join(names)
# Condense each location into a short text: version, host organization (if a source is given) and landing page
def join_locations(list_of_locations):
    """Return one ', '-separated string with a short summary per location."""

    def describe(location):
        # Locations without a source carry no host organization to report
        if not location['source']:
            return f"{location['version']} - {location['landing_page_url']}"
        return f"{location['version']}: {location['source']['host_organization_name']} - {location['landing_page_url']}"

    return ', '.join(describe(location) for location in list_of_locations)

Set the Topic & Year#

Set the topic, the year, and the maximum number of papers you want to retrieve.

# Query parameters: a narrower selection keeps the output small and the run time short
topic = "COVID"    # search term for the query
year = 2023        # publication year to filter on
n_max = 500        # maximum number of papers to fetch; None queries all papers

Get the preprints#

Run the following code to get the preprints for the specified parameters.

# Build the query: articles on the topic from the given year whose primary
# location is a submitted version and which are filtered on
# locations.is_published = False, most cited first
query = (
    Works()
    .search(topic)
    .filter(
        type="article",
        publication_year=year,
        primary_location={'version': 'submittedVersion'},
        locations={'is_published': False},
    )
    .sort(cited_by_count="desc")
)

preprints = []

# Iterate over all query results.
# chain.from_iterable pulls one page at a time; chain(*pages) would unpack the
# paginator up front, i.e. request every page before the first record is handled.
for item in chain.from_iterable(query.paginate(per_page=200, n_max=n_max)):

    # Treat a missing or null 'locations' entry as "no locations" so the
    # helper functions always receive a list
    locations = item.get('locations') or []
    locations_count = item.get('locations_count')

    # Skip papers that already have a published version in any location.
    # A paper with a single location is kept without inspecting it.
    if locations_count != 1 and is_any_location_published(locations):
        continue

    # Collect the key properties only for the papers that are kept
    preprints.append({
        'id': item.get('id'),
        'title': item.get('title'),
        'publication_date': item.get('publication_date'),
        'doi': item.get('doi'),
        'cited': item.get('cited_by_count'),
        'authors': join_authors(item.get('authorships') or []),
        'locations': join_locations(locations),
        'location_count': locations_count,
    })

Store the data#

# Turn the collected records (a list of dicts) into a table, one row per paper
df = pd.DataFrame(preprints)

# Export the table as CSV and as an Excel workbook; year and n_max are part of the file name
df.to_csv(f'./openalex_preprints_{year}_{n_max}.csv')
df.to_excel(f'./openalex_preprints_{year}_{n_max}.xlsx')

# Show the first rows as a quick check
df.head()
id title publication_date doi cited authors locations location_count
0 https://openalex.org/W4318909870 Overcoming Vaccine Skepticism in Pakistan: A C... 2023-02-02 https://doi.org/10.5281/zenodo.7597141 13 Bibi Aisha Sadiqa submittedVersion: European Organization for Nu... 1
1 https://openalex.org/W3197911323 The Political Economy of a Modern Pandemic: As... 2023-07-24 https://doi.org/10.32920/23739360.v1 7 John Shields, Zainab Abu Alrob submittedVersion - https://doi.org/10.32920/23... 2
2 https://openalex.org/W4319655309 Navigating the Post-COVID Market: A Prospectiv... 2023-02-09 https://doi.org/10.5281/zenodo.7625190 7 Peng Sun, Xiaode Zuo submittedVersion: European Organization for Nu... 1
3 https://openalex.org/W4316927004 Impact of Data Pre-Processing on Covid-19 Diag... 2023-01-03 https://doi.org/10.5281/zenodo.7543986 5 Dina Salem, Esraa.M.Hashim submittedVersion: European Organization for Nu... 1
4 https://openalex.org/W4321795579 Covid and Productivity in Europe: A Responsive... 2023-01-01 https://doi.org/10.2139/ssrn.4359120 4 Russell Cooper, Carl-Wolfram Horn, Leonardo In... submittedVersion - https://doi.org/10.2139/ssr... 1

Get a sample paper#

# Take the first row of the table (by position) as a sample paper and display it
paper = df.iloc[0]
paper
id                                   https://openalex.org/W4318909870
title               Overcoming Vaccine Skepticism in Pakistan: A C...
publication_date                                           2023-02-02
doi                            https://doi.org/10.5281/zenodo.7597141
cited                                                              13
authors                                             Bibi Aisha Sadiqa
locations           submittedVersion: European Organization for Nu...
location_count                                                      1
Name: 0, dtype: object