Showing 12 changed files with 8,065 additions and 2 deletions.
@@ -1,3 +1,6 @@
# Ignore data directory
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -0,0 +1,327 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Last updated: March 31, 2023\n",
"Last run: April 2021\n",
"\n",
"**Data Collection**\n",
"\n",
"## Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media\n",
"\n",
"Juan Pablo Alperin, Alice Fleerackers, Michelle Riedlinger & Stefanie Haustein\n",
"\n",
"**Related Publication:**\n",
"Alperin, J.P., Fleerackers, A., Riedlinger, M. & Haustein, S. (2023). Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media. *Zenodo*. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Caveat: This code was cleaned up from a messier original version that required solving many small data collection glitches. In particular, the original collection had some issues with Twitter IDs and Twitter user IDs being recorded in scientific notation. As such, it may not work perfectly. (An illustrative cell on reading IDs as strings follows below.)*\n",
"\n",
"*The code does faithfully capture the main approach and code used for data collection.*"
]
},
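{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative note (not part of the original pipeline): the scientific-notation problem mentioned above occurs when 64-bit tweet and user IDs are parsed as floats. A minimal sketch of the usual workaround, with a hypothetical filename and column names, is shown below.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: read ID columns as strings so large integer IDs are not\n",
"# coerced to floats and rendered in scientific notation.\n",
"# 'some_tweets.csv', 'id', and 'user_id' are placeholders, not the\n",
"# project's actual file or column names.\n",
"import pandas as pd\n",
"\n",
"example = pd.read_csv('some_tweets.csv', dtype={'id': str, 'user_id': str})"
]
},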
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import pandas as pd\n",
"import requests\n",
"\n",
"from tqdm.auto import tqdm\n",
"tqdm.pandas()\n",
"\n",
"from urllib.parse import unquote\n",
"\n",
"from pymed import PubMed\n",
"import random"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a file with the possible URLs for each article\n",
"Begins by finding a DOI for each PubMed ID, then resolves (unshortens) the DOI URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input: a list of all the PubMed IDs from our query\n",
"articles = pd.read_csv('data/covid_pubmed_ids-20210223.csv', header=None)\n",
"articles.columns = ['pmid']\n",
"\n",
"def get_doi(pmid):\n",
"    randint = random.randint(0,10000)\n",
"    email = 'nospam+%[email protected]' % randint\n",
"    pubmed = PubMed(tool=\"research\", email=email)\n",
"    results = list(pubmed.query(pmid, max_results=1))\n",
"    try:\n",
"        article = results[0]\n",
"        return article.doi\n",
"    except:\n",
"        return None\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the DOI for each PubMed ID using the PubMed API\n",
"articles['doi'] = articles.pmid.progress_apply(get_doi)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def unshort(url):\n",
"    try:\n",
"        r = requests.get(url, allow_redirects=True, timeout=15)\n",
"        return r.url\n",
"    except:\n",
"        return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Take the DOI URL and resolve it to find out what it links to\n",
"articles['resolved_url'] = articles.doi.progress_apply(lambda doi: unshort('https://doi.org/%s' % doi))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"articles['doi_url1'] = articles.doi.map(lambda doi: 'https://doi.org/%s' % doi)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Output file. This file was used as input for the CrowdTangle queries\n",
"articles.to_csv('data/covid_dois_in_4_outlets_with_urls.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"outlets_of_interest = ['MSN', 'New York Times', 'BBC News', 'The Guardian', 'Washington Post']\n",
"domains_of_interest = ['www.msn.com', 'www.nytimes.com', 'www.bbc.com', 'www.theguardian.com', 'www.washingtonpost.com']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Twitter activity using Twint\n",
"All tweets collected are placed in a \"tweets\" folder.\n",
"(These cannot be made publicly available and would need to be collected again.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import twint\n",
"import nest_asyncio\n",
"nest_asyncio.apply() # allows twint's asyncio searches to run inside the notebook's event loop\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Configure\n",
"def twint_search(url, outfile=False):\n",
"    try:\n",
"        c = twint.Config()\n",
"        # Search for everything in 2020 plus January 2021\n",
"        c.Search = \"%s since:2020-01-01 until:2021-02-01 filter:links\" % unquote(url)\n",
"        print(unquote(url))\n",
"        c.Pandas = True\n",
"        c.Hide_output = True\n",
"\n",
"        # Run\n",
"        twint.run.Search(c)\n",
"\n",
"        search_results = twint.storage.panda.Tweets_df\n",
"\n",
"        print(search_results.shape)\n",
"\n",
"        if outfile and search_results.shape[0] > 0:\n",
"            try:\n",
"                tweets = pd.read_csv(outfile, dtype={'tweet_id': str, 'user_id_str': str}, low_memory=False)\n",
"                tweets = tweets.append(search_results, ignore_index=True)\n",
"            except:\n",
"                tweets = search_results\n",
"\n",
"            tweets.to_csv(outfile, index=False)\n",
"\n",
"        return search_results.shape[0]\n",
"    except KeyboardInterrupt:\n",
"        raise\n",
"    except:\n",
"        print(\"Error: %s\" % url)\n",
"        return None"
]
},
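{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative usage (not part of the original run): twint_search takes a single URL and an optional output CSV; the URL and filename below are hypothetical.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example call; the real runs apply twint_search over whole\n",
"# columns of URLs in the cells below.\n",
"# twint_search('https://www.theguardian.com/world/2020/example-story', outfile='tweets/example_tweets.csv')"
]
},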
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Tweets about News Stories"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input file from the Altmetric Explorer query\n",
"df = pd.read_csv('data/altmetric_news_mentions.csv')\n",
"\n",
"df['url_clean'] = df.URL.map(lambda x: x[:x.find('?')].strip('/') if x.find('?') > 0 else x.strip('/'))\n",
"df['domain'] = df.URL.map(lambda x: x.split('/')[2])\n",
"df = df[df.domain.isin(domains_of_interest)]\n",
"\n",
"story_urls = df[['outlet', 'URL', 'url_clean']].drop_duplicates(subset='url_clean')\n",
"\n",
"# story_urls.to_excel('data/altmetric_unique_story_urls_top5outlets.xlsx', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# Collect all the tweets, one outlet at a time, just to keep things tidier\n",
"\n",
"for outlet in outlets_of_interest:\n",
"    tweets = None\n",
"    now = datetime.datetime.now().strftime('%Y%m%d_%H%M')\n",
"    outfile = 'tweets/%s_tweets_%s.csv' % (outlet.lower().replace(' ', '_'), now)\n",
"\n",
"    print('Going after %s' % outlet)\n",
"    story_urls.loc[story_urls.outlet == outlet, 'num_tweets'] = story_urls[story_urls.outlet == outlet].url_clean.progress_apply(lambda url: twint_search(url, outfile))\n",
"\n",
"## The final file used in the research is a merger of all of the output files from this step (an illustrative merge sketch follows in the next cell)\n"
]
},
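{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative note (not part of the original run): a minimal sketch of how the per-outlet output files from the step above could be merged into a single file. The glob pattern and the merged filename are assumptions, not the names used in the study.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: concatenate the per-outlet tweet CSVs written by twint_search.\n",
"# 'tweets/*_tweets_*.csv' and 'tweets/all_news_tweets.csv' are assumed names.\n",
"import glob\n",
"\n",
"news_tweets = pd.concat(\n",
"    [pd.read_csv(f, dtype=str) for f in glob.glob('tweets/*_tweets_*.csv')],\n",
"    ignore_index=True)\n",
"news_tweets.to_csv('tweets/all_news_tweets.csv', index=False)"
]
},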
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Tweets about Research"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"all_urls = set(articles.resolved_url).union(articles.doi_url1)\n",
"\n",
"df = articles.drop_duplicates(subset='doi_url1')\n",
"tweets = None\n",
"\n",
"outfile = 'tweets/research_tweets.csv'\n",
"\n",
"# This collects the tweets and saves the number found for each URL;\n",
"# the tweets themselves are saved to the outfile\n",
"df['num_tweets'] = df.doi_url1.progress_apply(lambda url: twint_search(url, outfile))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tweets = pd.read_csv('tweets/research_tweets.csv')\n",
"tweets.dropna(subset=['search'], inplace=True)\n",
"tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tweets = tweets[tweets.search.notna()]\n",
"\n",
"tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])\n",
"tweets.id = tweets.id.astype(str)\n",
"tweets = tweets[tweets['id'].notna()]\n",
"tweets.set_index('id', inplace=True)\n",
"tweets.index.name = 'tweet_id'\n",
"tweets.to_csv('citations/twint_research_url_mentions.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}