From 69c2ac41132df97f2b23e36c3fc917145ed7a558 Mon Sep 17 00:00:00 2001 From: Krzysztof Nowak Date: Wed, 30 May 2018 11:25:02 +0200 Subject: [PATCH] cleaned up communities classifier notebook Signed-off-by: Krzysztof Nowak --- communities-sklearn.ipynb | 218 +++++++------------------------------- 1 file changed, 37 insertions(+), 181 deletions(-) diff --git a/communities-sklearn.ipynb b/communities-sklearn.ipynb index ab08eb6..5fce066 100644 --- a/communities-sklearn.ipynb +++ b/communities-sklearn.ipynb @@ -2,10 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -27,10 +25,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "with open(\"./comms.json\", \"r\") as fp:\n", @@ -39,10 +35,8 @@ }, { "cell_type": "code", - "execution_count": 107, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "spam = ['lybinh', 'tieutieuhiep480549',]\n", @@ -71,10 +65,8 @@ }, { "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "previously_deleted_spam = ['5400', 'obatsakitmaagyangmanjurdanseringdicari', 'agen-poker', 'boyaqq', 'cekipokernet-agen-poker-online-android-uang-asli-terbaik-indonesia', 'tuanpoker', 'jeeptoto', 'test321', 'mycommunity', 'grosiramazonplus', 'ie', 'strangers_in_the_night', 'egames', 'zenodo-testing', 'the-biggest-online-sports-betting-site-in-malaysia', 'shira', 'thethaoqq188', 'searchengineoptimization', 'pokeronlineterpercaya', 'best-gaming-laptops', 'loto188', 'bandarceme', 'review3', 'domino99', 'sayangseo', 'bongdaqq188', 'jnepoker', 'onlineslotqq101-com-slot-machine-games', 'test-123', 'onlinecasinoqq', 'agen-judi-poker-online-terbaik', 'wargakartu', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya-1', 'casino-website', 'sahabatkartucom', 'sahabatqqcasinocom', 'dewa_poker', 'kontesseo3', 'j', '11111111', 'oo22', 'ngentod', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya-2506', 'qjoker', '043', 'slotqq188marimain', '002', '288', 'test1000', 'sahabatkartu-com-agen-poker-online', 'sahabatqq-casino', 'rajapoker', 'infojudi', 'sayangseo1', 'masterjudi88', 'ledstairnosingaustralia', 'partyvenueschicago', 'sportsqq288com-the-biggest-online-sports-betting-site-in-malaysia', 'kuramang', 'sahabatkartu', '021', 'interqqcom', '098', 'longxaodua', 'situsbandardanagenjudipokeruangasliterbesardanterpercayaindonesia', 'akifa_naila', 'masteragen', 'ajoqq', 'ichadinitraqq188', 'agen-poker-resmi-terpercaya-dan-terbaik', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya', 'cemarapoker', 'interqq', '001', 'kapalpoker-com', 'kapal_poker', 'jeniusseo', '365365', '087', 'amour', 'pandawa', '034', 'casinovietqq288hh', 'trusted-live-casino-gambling-website-in-malaysia', 'sahabatqq-casino-agen-domino-99-dan-poker-online-terbesar-di-asia', 'pokerdominoqq-online-situs-agen-poker-domino-qq-online-terpercaya', 'pokerdominoseo', 'jadipuas1', '086', '12', 'sahabatqqcasinoagendomino99danpokeronlineterbesardiasia', '112', 'pokerqq288', 'qq288', 'rajapoker88-situs-agen-judi-poker-bandar-domino-qq-online-terpercaya', 'putrilaura', '553', 'sahabatkartu-com-agen-poker-domino-99-online-bandarq-terpercaya-indonesia', 'seo', 'sahabatqqcasino', '007', 'sarana-pelangi-agen-judi-domino-qq-bandar-poker-dan-bandar-qiu-qiu-99-terpercaya-seasia', 'saranapelangiagen', 'superbejoq', 'bolaqiuqiu', 'bandarjudi', 'casinoqq188', 'casinoqq288', 'casinovietqq101', 'casinovietqq188', 'casinovnqq188', 'qq188', 'casinovqq188', 'linda', '101', 'livecasinoonlineqq101', 'livecasinoonlineqq', 'nandalistiohadi', 'nha-cai-danh-bai-truc-tuyen-casino-viet-qq288', 'casino', 'casinowebsite', 'live-casino-website', 'the-best-live-casinos-site-in-malaysia', 'live-casino', 'online-casino', 'sahabatqqdotcasino', 'sarana118-com-agen-judi-sbobet-live-casino-dan-togel-online-terpercaya-seasia', 'sarana118-com', 'casinoq288', 'casino-e-game-hap-dan-nhat-hien-nay', 'livecasinoonline', 'alternatifnyasehat', 'bliherbal', 'obatherbalsatu', 'herbalufi', 'hendi', 'bbbbbbbbbbb222222222222222', 'diherbalamazonplus', 'buycbdoil', 'mooremike', 'mooremike3', 'frankcurtis04', 'tokobukuonline', '1', 'agendomino99', 'agen_togel_online_terpercaya_zodiaktoto', 'bettingqq101', 'betting101', 'sports', 'slotgames', 'androidslots', 'slot-games-online', 'slot', 'slotmachines', 'onlineslotqq101-slot-machine-games-free-slot-betting-website', 'slotqq188', 'slot-machine', 'online-sports-betting', 'football288-', 'bettingslotqq188', 'onlineslotqq1881', 'onlineslotqq188', 'onlineslotqq288', 'bolaqq188', 'cdbd288', 'ilottoqq188', 'qq188asia', 'qq188asia-best-online-sports-bookie-website-asia-top-free-bets-bookmaker', 'bettingonline', 'onlinebetting', 'betting', 'sportsbetting', 'bettingsports', 'thethaovqq188', 'songbai', '188', 'review_terpercaya', 'sportsbook', 'sahabatqqcasino-agen-domino-99-dan-poker-online-terbesar-di-asia', 'thaothaoonline', 'cuocbongdaqq188', 'auduongkhac', 'nhacaitructuyen', 'gameonline', 'comrang', 'blackjacktvietqq288', 'bolaqq288', 'menangbesarbolaqq288', 'songvedem', 'bongdaqq288', 'cemeqq288-com-situs-agen-judi-ceme-online-indonesia', 'situsjudiqq288', 'poker', 'baotichnhuoc', 'test42', 'thethao', 'thethaoqq288', 'qq101', 'agensportsbookqq101', 'bandarbolaqq101', 'slotgamevietqq101', '77', 'slot188', 'bwinqq', 'caridomino', 'judi-kartu-domino-online-cocok-menjadi-permainan-semua-orang', 'saranapelangiagenjudidominoqq', 'sejarahqq-net-agen-dominoqq-online-bandarq-terpercaya', 'wtcdomino-com', 'wtcdomino', 'agentqjoker', 'rickpetko91795', 'rajaseoweb', 'agen-judi-kartu-permainan-terlengkap', '112345', 'sports-betting', 'sellmyhousefastindianapolis', 'testing', 'casinoonlineqq', 'buestestcommunity', 'herbalkankerpayudara11', 'online-betting', 'simple-and-basic-tips-and-advises-to-improve-your-online-casino-gaming', 'bandar', 'naga388', 'depoqqnetagenbandarqdominoqiuqiu', 'sahabatqq1', 'toko4d', '097', 'subhajit', 'infobet99', 'obatfrigid', 'kue-lebaran', 'aa', 'smallbusiness', 'bandarpelangi2']\n", @@ -83,30 +75,17 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "895" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(set(spam))" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Communities which were removed by manually checked to not be spam\n", @@ -115,10 +94,8 @@ }, { "cell_type": "code", - "execution_count": 111, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "X = []\n", @@ -131,20 +108,9 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "895" - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len([x for x in X if x['spam']])" ] @@ -159,9 +125,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_maybespam = [x for x in X if not x['spam'] and\n", @@ -174,9 +138,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "new_spam_ids = [x['id'] for x in X_maybespam]\n", @@ -186,9 +148,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "for x in X_maybespam:\n", @@ -204,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -240,10 +200,8 @@ }, { "cell_type": "code", - "execution_count": 140, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred)) if (ref, pred) == (False, True)]\n", @@ -252,53 +210,18 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('agriprima',\n", - " 'Agriprima, Journal of Applied Agricultural Sciences',\n", - " '

Agriprima, Journal of Applied Agricultural Sciences adalah Jurnal Ilmu Pertanian Terapan yang menjadi sarana bagi peneliti untuk mempublikasikan hasil penelitiannya dalam lingkup pemuliaan tanaman, bioteknologi tanaman, teknologi benih, perlindungan tanaman, dan kesuburan tanah.

\\r\\n\\r\\n

Agriprima diterbitkan oleh Jurusan Produksi Pertanian Politeknik Negeri Jember bekerjasama dengan Politeknik, Fakultas Pertanian serta Pusat Penelitian Kopi dan Kakao Indonesia.

\\r\\n'),\n", - " ('talkinmaths',\n", - " 'Talk in Mathematics',\n", - " '

This is a collection of transcriptions of mathematics classroom interactions.  It is intended to grow over time and diversify to include a wider range of transcriptions, such as transcriptions of groups working on mathematics tasks, and transcriptions of lessons in subjects other than mathematics

\\r\\n'),\n", - " ('prueba', 'Prueba', '

test

\\r\\n')]" - ] - }, - "execution_count": 141, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spammy_stuff" ] }, { "cell_type": "code", - "execution_count": 119, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'curation_policy': '',\n", - " 'deleted_at': 'None',\n", - " 'description': '

Grup LSKK selalu membuka diri untuk bekerja sama dengan sebanyak mungkin mitra untuk mencapai pencapaian terbaik

\\r\\n',\n", - " 'id': 'lskk',\n", - " 'id_user': 23096,\n", - " 'page': '',\n", - " 'spam': True,\n", - " 'title': 'Lab. Sistem Kendali dan Komputer'}" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "[x for x in X if x['id'] == 'lskk'][0]" ] @@ -312,10 +235,8 @@ }, { "cell_type": "code", - "execution_count": 155, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import LeaveOneOut, KFold\n", @@ -359,23 +280,9 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({(False, False): 1168,\n", - " (False, True): 10,\n", - " (True, False): 27,\n", - " (True, True): 868})" - ] - }, - "execution_count": 156, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "acc = [(ref, pred) for ref, pred in zip(y, res)]\n", "Counter(acc)" @@ -395,10 +302,8 @@ }, { "cell_type": "code", - "execution_count": 159, - "metadata": { - "collapsed": true - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "acc = [idx for idx, (ref, pred) in enumerate(zip(y, res)) if (ref, pred) == (False, True)]\n", @@ -407,47 +312,9 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('docshyr',\n", - " 'ДокШир',\n", - " '

ДокШир створено задля безперешкодного поширення ініціатив руху Відкритого доступу серед українських фахівців бібліотечної справи.

\\r\\n'),\n", - " ('researchtools',\n", - " 'Research Tools Box By: Dr. Nader Ale Ebrahim',\n", - " '

This Topic is designed to assist students to aim at reducing the search time by increasing their knowledge to more effectively use the "Research Tools" which is available through the Net.

\\r\\n\\r\\n

Created by Nader Ale Ebrahim

\\r\\n'),\n", - " ('kne-test',\n", - " 'The Egyptian Chemical Society',\n", - " '\\r\\n\\r\\n\\r\\nAbout Journal Sidebar\\r\\n\\r\\n\\r\\n


\\r\\n

\\r\\n

Egyptian Journal of Chemistry Produced\\r\\nand hosted on behalf Academy of Scientific Research and Technology\\r\\nInformatics Sector and Scientific Services The National Centre for\\r\\nInformation and Documentation (NIDOC), Dokki, Cairo, Egypt

\\r\\n

Edited by:    The\\r\\nEgyptian Chemical  Society
\\r\\nISSN:            0449-2285

\\r\\n

 

\\r\\n\\r\\n'),\n", - " ('prueba', 'Prueba', '

test

\\r\\n'),\n", - " ('cobp',\n", - " 'The Complexity of Obesity Proceedings',\n", - " '

This is an open access conference journal according to the Norwegian Social Science Data Services. We publish proceedings that have been satisfactory presented at "The Complexity of Obesity Conferences" or related events. 

\\r\\n'),\n", - " ('saranauthorunismuh',\n", - " 'LP3M Unismuh Makassar',\n", - " '

Lembaga Penenlitian, Pengembangan dan Pengabdian Masyarakat

\\r\\n'),\n", - " ('semfefu',\n", - " 'Школа экономики и менеджмента Дальневосточного федерального университета',\n", - " '

Школа экономики и менеджмента включает в себя 14 кафедр, реализующих широкий спектр образовательных программ по всем направлениям подготовки экономистов и менеджеров: 13 образовательных программ бакалавриата и 22 магистерских программы. Школа активно развивает экономическое, технологическое и культурное сотрудничество со странами АТР и другими государствами. Выпускники Школы востребованы в крупнейших российских и международных компаниях.

\\r\\n'),\n", - " ('talkinmaths',\n", - " 'Talk in Mathematics',\n", - " '

This is a collection of transcriptions of mathematics classroom interactions.  It is intended to grow over time and diversify to include a wider range of transcriptions, such as transcriptions of groups working on mathematics tasks, and transcriptions of lessons in subjects other than mathematics

\\r\\n'),\n", - " ('agriprima',\n", - " 'Agriprima, Journal of Applied Agricultural Sciences',\n", - " '

Agriprima, Journal of Applied Agricultural Sciences adalah Jurnal Ilmu Pertanian Terapan yang menjadi sarana bagi peneliti untuk mempublikasikan hasil penelitiannya dalam lingkup pemuliaan tanaman, bioteknologi tanaman, teknologi benih, perlindungan tanaman, dan kesuburan tanah.

\\r\\n\\r\\n

Agriprima diterbitkan oleh Jurusan Produksi Pertanian Politeknik Negeri Jember bekerjasama dengan Politeknik, Fakultas Pertanian serta Pusat Penelitian Kopi dan Kakao Indonesia.

\\r\\n'),\n", - " ('open-literature-reviews',\n", - " 'Open Literature Reviews',\n", - " '

This community is for all researchers who want to share the datasets from their systematic literature reviews or mapping studies. See format instructions on the about page.

\\r\\n')]" - ] - }, - "execution_count": 160, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spammy_stuff" ] @@ -461,20 +328,9 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['2017_06_18_communities_spam.pkl']" - ] - }, - "execution_count": 166, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from sklearn.externals import joblib\n", "joblib.dump(text_clf, '2017_06_18_communities_spam.pkl') " @@ -497,7 +353,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.5" } }, "nbformat": 4,