Retracted Paper#
We use OpenAlex to retrieve all articles that are preprints and have not yet been published in a peer-reviewed journal.
Load libraries#
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from itertools import chain
import pandas as pd
import pyalex
Define helper functions#
# Determine whether the paper is already published at any of its locations (journals)
def is_any_location_published(locations):
    """Return True if at least one location holds the published version.

    Args:
        locations: Iterable of location dicts; each may carry a 'version'
            key. ``None`` or an empty iterable is treated as "no locations".

    Returns:
        True if any location's 'version' equals 'publishedVersion',
        otherwise False.
    """
    # `or ()` guards against a missing/None list of locations; empty or None
    # entries are skipped and `.get` tolerates entries without a 'version' key.
    return any(
        location.get('version') == 'publishedVersion'
        for location in locations or ()
        if location
    )
# Combine all authors
def join_authors(list_of_authors):
    """Return the authors' display names as one comma-separated string.

    Args:
        list_of_authors: Iterable of authorship dicts, each holding an
            'author' dict with a 'display_name'. ``None`` is treated as empty.

    Returns:
        The display names joined with ', '. Authorships without a usable
        display name are skipped; an empty string is returned if none remain.
    """
    names = []
    for authorship in list_of_authors or ():
        # 'author' or its 'display_name' may be missing or None; a None name
        # would make str.join raise TypeError, so such entries are skipped.
        author = (authorship or {}).get('author') or {}
        display_name = author.get('display_name')
        if display_name:
            names.append(display_name)
    return ', '.join(names)
# Extract key information from the locations
def join_locations(list_of_locations):
    """Summarise each location as text and join the summaries with ', '.

    Args:
        list_of_locations: Iterable of location dicts with the keys
            'version', 'landing_page_url' and 'source' (a dict holding
            'host_organization_name', or a falsy value when there is no
            source). ``None`` is treated as an empty list.

    Returns:
        One entry per location, formatted as
        '<version>: <host organization> - <landing page url>' when a source
        is present and '<version> - <landing page url>' otherwise, joined
        with ', '. Missing values are rendered as 'None'.
    """
    summary = []
    for location in list_of_locations or ():
        if not location:
            # Nothing to summarise for an empty or None entry
            continue
        # `.get` keeps a single incomplete record from aborting the whole run
        version = location.get('version')
        landing_page_url = location.get('landing_page_url')
        source = location.get('source')
        if source:
            summary.append(f"{version}: {source.get('host_organization_name')} - {landing_page_url}")
        else:
            summary.append(f"{version} - {landing_page_url}")
    return ', '.join(summary)
Set the Topic & Year#
Set the topic, the year, and the maximum number of papers you want to retrieve.
# Query parameters. Narrower settings reduce the size of the output and the
# time required for execution.
# Search term passed to the works search
topic = 'COVID'
# Publication year used to filter the works
year = 2023
# Upper bound on the number of works fetched while paginating
n_max = 500 # when set to None all papers are queried
Get the preprints#
Run the following code to get the preprints for the specified parameters.
# Build the query: articles of the given year that match the topic, whose
# primary location is a submitted version and which have no published
# location, sorted by citation count in descending order.
query = (
    Works()
    .search(topic)
    .filter(
        type="article",
        publication_year=year,
        primary_location={'version': 'submittedVersion'},
        locations={'is_published': False},
    )
    .sort(cited_by_count="desc")
)
preprints = []
# Iterate over all query results. chain.from_iterable consumes the pages
# lazily, one at a time; chain(*pages) would unpack (and therefore fetch)
# every page before the first iteration.
for item in chain.from_iterable(query.paginate(per_page=200, n_max=n_max)):
    # `or []` covers both a missing key and an explicit None value
    locations = item.get('locations') or []
    locations_count = item.get('locations_count')
    # Only keep the paper if it is not published in any other journal
    if locations_count != 1 and is_any_location_published(locations):
        continue
    preprints.append({
        'id': item.get('id'),
        'title': item.get('title'),
        'publication_date': item.get('publication_date'),
        'doi': item.get('doi'),
        'cited': item.get('cited_by_count'),
        # Join all authors into one string
        'authors': join_authors(item.get('authorships') or []),
        # One-line summary of every location of the paper
        'locations': join_locations(locations),
        'location_count': locations_count,
    })
Store the data#
# Column order of the exported table. Passing it explicitly keeps the headers
# in place even when no preprint was found (an empty list would otherwise
# produce a DataFrame without any columns).
preprint_columns = ['id', 'title', 'publication_date', 'doi', 'cited',
                    'authors', 'locations', 'location_count']
# A list of records goes to the DataFrame constructor; DataFrame.from_dict is
# documented for dict input.
df = pd.DataFrame(preprints, columns=preprint_columns)
# Export the table as CSV and as an Excel workbook; the file names carry the
# year and the n_max setting used for the query.
df.to_csv(f'./openalex_preprints_{year}_{n_max}.csv')
df.to_excel(f'./openalex_preprints_{year}_{n_max}.xlsx')
# Show the first rows as the cell output
df.head()
id | title | publication_date | doi | cited | authors | locations | location_count | |
---|---|---|---|---|---|---|---|---|
0 | https://openalex.org/W4318909870 | Overcoming Vaccine Skepticism in Pakistan: A C... | 2023-02-02 | https://doi.org/10.5281/zenodo.7597141 | 13 | Bibi Aisha Sadiqa | submittedVersion: European Organization for Nu... | 1 |
1 | https://openalex.org/W3197911323 | The Political Economy of a Modern Pandemic: As... | 2023-07-24 | https://doi.org/10.32920/23739360.v1 | 7 | John Shields, Zainab Abu Alrob | submittedVersion - https://doi.org/10.32920/23... | 2 |
2 | https://openalex.org/W4319655309 | Navigating the Post-COVID Market: A Prospectiv... | 2023-02-09 | https://doi.org/10.5281/zenodo.7625190 | 7 | Peng Sun, Xiaode Zuo | submittedVersion: European Organization for Nu... | 1 |
3 | https://openalex.org/W4316927004 | Impact of Data Pre-Processing on Covid-19 Diag... | 2023-01-03 | https://doi.org/10.5281/zenodo.7543986 | 5 | Dina Salem, Esraa.M.Hashim | submittedVersion: European Organization for Nu... | 1 |
4 | https://openalex.org/W4321795579 | Covid and Productivity in Europe: A Responsive... | 2023-01-01 | https://doi.org/10.2139/ssrn.4359120 | 4 | Russell Cooper, Carl-Wolfram Horn, Leonardo In... | submittedVersion - https://doi.org/10.2139/ssr... | 1 |
Get a sample paper#
# Take the first row of the table as a sample paper
# (raises IndexError if the DataFrame is empty).
paper = df.iloc[0]
# Bare expression: shown as the cell output when run in a notebook
paper
id https://openalex.org/W4318909870
title Overcoming Vaccine Skepticism in Pakistan: A C...
publication_date 2023-02-02
doi https://doi.org/10.5281/zenodo.7597141
cited 13
authors Bibi Aisha Sadiqa
locations submittedVersion: European Organization for Nu...
location_count 1
Name: 0, dtype: object