Showing 12 changed files with 8,065 additions and 2 deletions.
@@ -1,3 +1,6 @@
# Ignore data directory
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -0,0 +1,327 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Last updated: March 31, 2023\n",
"Last run: April 2021\n",
"\n",
"**Data Collection**\n",
"\n",
"## Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media\n",
"\n",
"Juan Pablo Alperin, Alice Fleerackers, Michelle Riedlinger & Stefanie Haustein\n",
"\n",
"**Related Publication:**\n",
"Alperin, J.P., Fleerackers, A., Riedlinger, M. & Haustein, S. (2023). Second-order Effects in Altmetrics: A Case Study Analyzing the Audiences of COVID-19 Research in the News and on Social Media. *Zenodo*. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Caveat: This code was cleaned up from a messier original version that required solving many small data collection glitches. In particular, the original collection had some issues with Twitter IDs and Twitter user IDs being recorded in scientific notation. As such, it may not work perfectly. (An illustrative cell on reading IDs as strings follows below.)*\n",
"\n",
"*The code does faithfully capture the main approach and code used for data collection.*"
]
},
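{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative note (not part of the original pipeline): the scientific-notation problem mentioned above occurs when 64-bit tweet and user IDs are parsed as floats. A minimal sketch of the usual workaround, with a hypothetical filename and column names, is shown below.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: read ID columns as strings so large integer IDs are not\n",
"# coerced to floats and rendered in scientific notation.\n",
"# 'some_tweets.csv', 'id', and 'user_id' are placeholders, not the\n",
"# project's actual file or column names.\n",
"import pandas as pd\n",
"\n",
"example = pd.read_csv('some_tweets.csv', dtype={'id': str, 'user_id': str})"
]
},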
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import pandas as pd\n",
"import requests\n",
"\n",
"from tqdm.auto import tqdm\n",
"tqdm.pandas()\n",
"\n",
"from urllib.parse import unquote\n",
"\n",
"from pymed import PubMed\n",
"import random"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a file with the possible URLs for each article\n",
"Begins by finding a DOI for each PubMed ID, then resolves (unshortens) the DOI URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input: a list of all the PubMed IDs from our query\n",
"articles = pd.read_csv('data/covid_pubmed_ids-20210223.csv', header=None)\n",
"articles.columns = ['pmid']\n",
"\n",
"def get_doi(pmid):\n",
"    randint = random.randint(0,10000)\n",
"    email = 'nospam+%[email protected]' % randint\n",
"    pubmed = PubMed(tool=\"research\", email=email)\n",
"    results = list(pubmed.query(pmid, max_results=1))\n",
"    try:\n",
"        article = results[0]\n",
"        return article.doi\n",
"    except:\n",
"        return None\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the DOI for each PubMed ID using the PubMed API\n",
"articles['doi'] = articles.pmid.progress_apply(get_doi)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def unshort(url):\n",
"    try:\n",
"        r = requests.get(url, allow_redirects=True, timeout=15)\n",
"        return r.url\n",
"    except:\n",
"        return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Take the DOI URL and resolve it to find out what it links to\n",
"articles['resolved_url'] = articles.doi.progress_apply(lambda doi: unshort('https://doi.org/%s' % doi))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"articles['doi_url1'] = articles.doi.map(lambda doi: 'https://doi.org/%s' % doi)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Output file. This file was used as input for the CrowdTangle queries\n",
"articles.to_csv('data/covid_dois_in_4_outlets_with_urls.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"outlets_of_interest = ['MSN', 'New York Times', 'BBC News', 'The Guardian', 'Washington Post']\n",
"domains_of_interest = ['www.msn.com', 'www.nytimes.com', 'www.bbc.com', 'www.theguardian.com', 'www.washingtonpost.com']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Twitter activity using Twint\n",
"All tweets collected are placed in a \"tweets\" folder.\n",
"(These cannot be made publicly available and would need to be collected again.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import twint\n",
"import nest_asyncio\n",
"nest_asyncio.apply() # allows twint's asyncio searches to run inside the notebook's event loop\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Configure\n",
"def twint_search(url, outfile=False):\n",
"    try:\n",
"        c = twint.Config()\n",
"        # Search for everything in 2020 plus January 2021\n",
"        c.Search = \"%s since:2020-01-01 until:2021-02-01 filter:links\" % unquote(url)\n",
"        print(unquote(url))\n",
"        c.Pandas = True\n",
"        c.Hide_output = True\n",
"\n",
"        # Run\n",
"        twint.run.Search(c)\n",
"\n",
"        search_results = twint.storage.panda.Tweets_df\n",
"\n",
"        print(search_results.shape)\n",
"\n",
"        if outfile and search_results.shape[0] > 0:\n",
"            try:\n",
"                tweets = pd.read_csv(outfile, dtype={'tweet_id': str, 'user_id_str': str}, low_memory=False)\n",
"                tweets = tweets.append(search_results, ignore_index=True)\n",
"            except:\n",
"                tweets = search_results\n",
"\n",
"            tweets.to_csv(outfile, index=False)\n",
"\n",
"        return search_results.shape[0]\n",
"    except KeyboardInterrupt:\n",
"        raise\n",
"    except:\n",
"        print(\"Error: %s\" % url)\n",
"        return None"
]
},
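{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative usage (not part of the original run): twint_search takes a single URL and an optional output CSV; the URL and filename below are hypothetical.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical example call; the real runs apply twint_search over whole\n",
"# columns of URLs in the cells below.\n",
"# twint_search('https://www.theguardian.com/world/2020/example-story', outfile='tweets/example_tweets.csv')"
]
},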
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Tweets about News Stories"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Input file from the Altmetric Explorer query\n",
"df = pd.read_csv('data/altmetric_news_mentions.csv')\n",
"\n",
"df['url_clean'] = df.URL.map(lambda x: x[:x.find('?')].strip('/') if x.find('?') > 0 else x.strip('/'))\n",
"df['domain'] = df.URL.map(lambda x: x.split('/')[2])\n",
"df = df[df.domain.isin(domains_of_interest)]\n",
"\n",
"story_urls = df[['outlet', 'URL', 'url_clean']].drop_duplicates(subset='url_clean')\n",
"\n",
"# story_urls.to_excel('data/altmetric_unique_story_urls_top5outlets.xlsx', index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# Collect all the tweets, one outlet at a time, just to keep things tidier\n",
"\n",
"for outlet in outlets_of_interest:\n",
"    tweets = None\n",
"    now = datetime.datetime.now().strftime('%Y%m%d_%H%M')\n",
"    outfile = 'tweets/%s_tweets_%s.csv' % (outlet.lower().replace(' ', '_'), now)\n",
"\n",
"    print('Going after %s' % outlet)\n",
"    story_urls.loc[story_urls.outlet == outlet, 'num_tweets'] = story_urls[story_urls.outlet == outlet].url_clean.progress_apply(lambda url: twint_search(url, outfile))\n",
"\n",
"## The final file used in the research is a merger of all of the output files from this step (an illustrative merge sketch follows in the next cell)\n"
]
},
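{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Illustrative note (not part of the original run): a minimal sketch of how the per-outlet output files from the step above could be merged into a single file. The glob pattern and the merged filename are assumptions, not the names used in the study.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: concatenate the per-outlet tweet CSVs written by twint_search.\n",
"# 'tweets/*_tweets_*.csv' and 'tweets/all_news_tweets.csv' are assumed names.\n",
"import glob\n",
"\n",
"news_tweets = pd.concat(\n",
"    [pd.read_csv(f, dtype=str) for f in glob.glob('tweets/*_tweets_*.csv')],\n",
"    ignore_index=True)\n",
"news_tweets.to_csv('tweets/all_news_tweets.csv', index=False)"
]
},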
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collect Tweets about Research"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"all_urls = set(articles.resolved_url).union(articles.doi_url1)\n",
"\n",
"df = articles.drop_duplicates(subset='doi_url1')\n",
"tweets = None\n",
"\n",
"outfile = 'tweets/research_tweets.csv'\n",
"\n",
"# This collects the tweets and saves the number found for each URL;\n",
"# the tweets themselves are saved to the outfile\n",
"df['num_tweets'] = df.doi_url1.progress_apply(lambda url: twint_search(url, outfile))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tweets = pd.read_csv('tweets/research_tweets.csv')\n",
"tweets.dropna(subset=['search'], inplace=True)\n",
"tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tweets = tweets[tweets.search.notna()]\n",
"\n",
"tweets['url_clean'] = tweets.search.map(lambda x: x.split(' ')[0])\n",
"tweets.id = tweets.id.astype(str)\n",
"tweets = tweets[tweets['id'].notna()]\n",
"tweets.set_index('id', inplace=True)\n",
"tweets.index.name = 'tweet_id'\n",
"tweets.to_csv('citations/twint_research_url_mentions.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}