diff --git a/.gitignore b/.gitignore index 9bb3a1e..4e362c3 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ docker-compose.override.yml .config.json *.backup *_cred.json + +# services +services/**/*.bin +services/**/*.txt diff --git a/client/js/lb-services.js b/client/js/lb-services.js index 12a8826..6ad57a3 100644 --- a/client/js/lb-services.js +++ b/client/js/lb-services.js @@ -10458,15 +10458,15 @@ if (typeof module !== 'undefined' && typeof exports !== 'undefined' && * * @description * - * Twitter endpoint to receive followers + * Fetch data for various charts * * @param {Object=} parameters Request parameters. * - * - `src` – `{string=}` - the chart source + * - `src` – `{string=}` - chart source * - * - `type` – `{string=}` - the chart type + * - `type` – `{string=}` - chart type * - * - `filter` – `{object=}` - + * - `qs` – `{object=}` - querystring * * @param {function(Object,Object)=} successCb * Success callback with two arguments: `value`, `responseHeaders`. @@ -11176,6 +11176,681 @@ if (typeof module !== 'undefined' && typeof exports !== 'undefined' && + return R; + }]); + +/** + * @ngdoc object + * @name lbServices.Topic + * @header lbServices.Topic + * @object + * + * @description + * + * A $resource object for interacting with the `Topic` model. + * + * **Details** + * + * Learned topics from smposts + * + * ## Example + * + * See + * {@link http://docs.angularjs.org/api/ngResource.$resource#example $resource} + * for an example of using this object. + * + */ + module.factory( + "Topic", + [ + 'LoopBackResource', 'LoopBackAuth', '$injector', '$q', + function(LoopBackResource, LoopBackAuth, $injector, $q) { + var R = LoopBackResource( + urlBase + "/topics/:id", + { 'id': '@id' }, + { + + /** + * @ngdoc method + * @name lbServices.Topic#create + * @methodOf lbServices.Topic + * + * @description + * + * Create a new instance of the model and persist it into the data source. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "create": { + url: urlBase + "/topics", + method: "POST", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#createMany + * @methodOf lbServices.Topic + * + * @description + * + * Create a new instance of the model and persist it into the data source. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Array.,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Array.} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "createMany": { + isArray: true, + url: urlBase + "/topics", + method: "POST", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#upsert + * @methodOf lbServices.Topic + * + * @description + * + * Update an existing model instance or insert a new one into the data source. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "upsert": { + url: urlBase + "/topics", + method: "PUT", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#exists + * @methodOf lbServices.Topic + * + * @description + * + * Check whether a model instance exists in the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - Model id + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * Data properties: + * + * - `exists` – `{boolean=}` - + */ + "exists": { + url: urlBase + "/topics/:id/exists", + method: "GET", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#findById + * @methodOf lbServices.Topic + * + * @description + * + * Find a model instance by id from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - Model id + * + * - `filter` – `{object=}` - Filter defining fields and include + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "findById": { + url: urlBase + "/topics/:id", + method: "GET", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#find + * @methodOf lbServices.Topic + * + * @description + * + * Find all instances of the model matched by filter from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `filter` – `{object=}` - Filter defining fields, where, include, order, offset, and limit + * + * @param {function(Array.,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Array.} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "find": { + isArray: true, + url: urlBase + "/topics", + method: "GET", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#findOne + * @methodOf lbServices.Topic + * + * @description + * + * Find first instance of the model matched by filter from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `filter` – `{object=}` - Filter defining fields, where, include, order, offset, and limit + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "findOne": { + url: urlBase + "/topics/findOne", + method: "GET", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#updateAll + * @methodOf lbServices.Topic + * + * @description + * + * Update instances of the model matched by where from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `where` – `{object=}` - Criteria to match model instances + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * The number of instances updated + */ + "updateAll": { + url: urlBase + "/topics/update", + method: "POST", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#deleteById + * @methodOf lbServices.Topic + * + * @description + * + * Delete a model instance by id from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - Model id + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "deleteById": { + url: urlBase + "/topics/:id", + method: "DELETE", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#count + * @methodOf lbServices.Topic + * + * @description + * + * Count instances of the model matched by where from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `where` – `{object=}` - Criteria to match model instances + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * Data properties: + * + * - `count` – `{number=}` - + */ + "count": { + url: urlBase + "/topics/count", + method: "GET", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#prototype$updateAttributes + * @methodOf lbServices.Topic + * + * @description + * + * Update attributes for a model instance and persist it into the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - PersistedModel id + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "prototype$updateAttributes": { + url: urlBase + "/topics/:id", + method: "PUT", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#createChangeStream + * @methodOf lbServices.Topic + * + * @description + * + * Create a change stream. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * - `options` – `{object=}` - + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * Data properties: + * + * - `changes` – `{ReadableStream=}` - + */ + "createChangeStream": { + url: urlBase + "/topics/change-stream", + method: "POST", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#destroyData + * @methodOf lbServices.Topic + * + * @description + * + * Destroy all records. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method does not accept any data. Supply an empty object. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "destroyData": { + url: urlBase + "/topics/destroy-all", + method: "POST", + }, + + /** + * @ngdoc method + * @name lbServices.Topic#destroyByIds + * @methodOf lbServices.Topic + * + * @description + * + * Destroy selected records by id. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + "destroyByIds": { + url: urlBase + "/topics/destroy", + method: "POST", + }, + } + ); + + + + /** + * @ngdoc method + * @name lbServices.Topic#updateOrCreate + * @methodOf lbServices.Topic + * + * @description + * + * Update an existing model instance or insert a new one into the data source. + * + * @param {Object=} parameters Request parameters. + * + * This method does not accept any parameters. + * Supply an empty object or omit this argument altogether. + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + R["updateOrCreate"] = R["upsert"]; + + /** + * @ngdoc method + * @name lbServices.Topic#update + * @methodOf lbServices.Topic + * + * @description + * + * Update instances of the model matched by where from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `where` – `{object=}` - Criteria to match model instances + * + * @param {Object} postData Request data. + * + * This method expects a subset of model properties as request parameters. + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * The number of instances updated + */ + R["update"] = R["updateAll"]; + + /** + * @ngdoc method + * @name lbServices.Topic#destroyById + * @methodOf lbServices.Topic + * + * @description + * + * Delete a model instance by id from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - Model id + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + R["destroyById"] = R["deleteById"]; + + /** + * @ngdoc method + * @name lbServices.Topic#removeById + * @methodOf lbServices.Topic + * + * @description + * + * Delete a model instance by id from the data source. + * + * @param {Object=} parameters Request parameters. + * + * - `id` – `{*}` - Model id + * + * @param {function(Object,Object)=} successCb + * Success callback with two arguments: `value`, `responseHeaders`. + * + * @param {function(Object)=} errorCb Error callback with one argument: + * `httpResponse`. + * + * @returns {Object} An empty reference that will be + * populated with the actual data once the response is returned + * from the server. + * + * + * (The remote method definition does not provide any description. + * This usually means the response is a `Topic` object.) + * + */ + R["removeById"] = R["deleteById"]; + + + /** + * @ngdoc property + * @name lbServices.Topic#modelName + * @propertyOf lbServices.Topic + * @description + * The name of the model represented by this $resource, + * i.e. `Topic`. + */ + R.modelName = "Topic"; + + + return R; }]); diff --git a/client/js/topics.js b/client/js/topics.js new file mode 100644 index 0000000..b0fcc0d --- /dev/null +++ b/client/js/topics.js @@ -0,0 +1,350 @@ +var w = window.innerWidth * 0.75; +var h = Math.ceil(w * 0.7); +var oR = 0; +var nTop = 0; + +var svgContainer = d3.select("#mainBubble") + .style("height", h + "px"); + +var svg = d3.select("#mainBubble").append("svg") + .attr("class", "mainBubbleSVG") + .attr("width", w) + .attr("height", h) + .on("mouseleave", function() { + return resetBubbles(); + }); + +var mainNote = svg.append("text") + .attr("id", "bubbleItemNote") + .attr("x", 10) + .attr("y", w / 2 - 15) + .attr("font-size", 12) + .attr("dominant-baseline", "middle") + .attr("alignment-baseline", "middle") + .style("fill", "#888888") + .text(function(d) { + return ""; + }); + +function formatData(topics) { + console.info(topics) + + let parent = { + name: "bubble" + } + + let topic = topics[0], // just the first one for the demo + _topics = JSON.parse(topic.topics) + + parent.children = _topics.reduce((acc, curr) => { + + let o = { + name: curr.ex_words[0], + description: curr.ex_words, + weight: curr.weight + } + o.children = curr.ex_words.map(w => { + return { + name: w, + note: w + } + }) + + acc.push(o) + return acc + + }, []).filter(c => c.weight >= 0.05) + + return parent +} + +var searchparams = new URLSearchParams(window.location.search) +var thread = searchparams.get('a') + +function changeThread(el) { + searchparams.set("a", el.value) + window.location.search = searchparams.toString() +} + +document.getElementById('thread').value = thread + +d3.json(`/api/topics/?filter[limit]=1&filter[offset]=${thread}`, (err, docs) => { + if (err) + console.error(err); + + root = formatData(docs) + + // show tweets for the topic + $('#data').append([ + docs[0].all_text.slice(0,50).join('\n'), + '\n\n** Tokens **\n', + docs[0].tokens + ]); + + var bubbleObj = svg.selectAll(".topBubble") + .data(root.children) + .enter().append("g") + .attr("id", function(d, i) { + return "topBubbleAndText_" + i + }); + + nTop = root.children.length; + oR = w / (1 + 3 * nTop); + + h = Math.ceil(w / nTop * 2); + svgContainer.style("height", h + "px"); + + var colVals = d3.scale.category10(); + + bubbleObj.append("circle") + .attr("class", "topBubble") + .attr("id", function(d, i) { + return "topBubble" + i; + }) + .attr("r", function(d) { + return (d.weight * 2.3) * oR; + }) + .attr("cx", function(d, i) { + return oR * (3 * (1 + i) - 1); + }) + .attr("cy", (h + oR) / 3) + .style("fill", function(d, i) { + return colVals(i); + }) // #1f77b4 + .style("opacity", 0.3) + // .on("mouseover", function(d, i) { + // return activateBubble(d, i); + // }); + + bubbleObj.append("text") + .attr("class", "topBubbleText") + .attr("x", function(d, i) { + return oR * (3 * (1 + i) - 1); + }) + .attr("y", (h + oR) / 3) + .style("fill", function(d, i) { + return colVals(i); + }) // #1f77b4 + .attr("font-size", 30) + .attr("text-anchor", "middle") + .attr("dominant-baseline", "middle") + .attr("alignment-baseline", "middle") + .text(function(d) { + return d.name + }) + .on("mouseover", function(d, i) { + return activateBubble(d, i); + }); + + + for (var iB = 0; iB < nTop; iB++) { + var childBubbles = svg.selectAll(".childBubble" + iB) + .data(root.children[iB].children) + .enter().append("g"); + + //var nSubBubble = Math.floor(root.children[iB].children.length/2.0); + + childBubbles.append("circle") + .attr("class", "childBubble" + iB) + .attr("id", function(d, i) { + return "childBubble_" + iB + "sub_" + i; + }) + .attr("r", function(d) { + return oR / 3.0; + }) + .attr("cx", function(d, i) { + return (oR * (3 * (iB + 1) - 1) + oR * 1.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("cy", function(d, i) { + return ((h + oR) / 3 + oR * 1.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("cursor", "pointer") + .style("opacity", 0.5) + .style("fill", "#eee") + // .on("click", function(d, i) { + // window.open(d.address); + // }) + .on("mouseover", function(d, i) { + // var noteText = ""; + // if (d.note == null || d.note == "") { + // noteText = d.address; + // } else { + // noteText = d.note; + // } + // d3.select("#bubbleItemNote").text(noteText); + }) + .append("svg:title") + .text(function(d) { + return d.address; + }); + + childBubbles.append("text") + .attr("class", "childBubbleText" + iB) + .attr("x", function(d, i) { + return (oR * (3 * (iB + 1) - 1) + oR * 1.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("y", function(d, i) { + return ((h + oR) / 3 + oR * 1.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }) + .style("opacity", 0.5) + .attr("text-anchor", "middle") + .style("fill", function(d, i) { + return colVals(iB); + }) // #1f77b4 + .attr("font-size", 6) + .attr("cursor", "pointer") + .attr("dominant-baseline", "middle") + .attr("alignment-baseline", "middle") + .text(function(d) { + return d.name + }) + .on("click", function(d, i) { + // window.open(d.address); + }); + } +}); + +resetBubbles = function() { + // w = window.innerWidth * 0.75; + oR = w / (1 + 3 * nTop); + + h = Math.ceil(w / nTop * 2); + svgContainer.style("height", h + "px"); + + mainNote.attr("y", h - 15); + + svg.attr("width", w); + svg.attr("height", h); + + // d3.select("#bubbleItemNote").text("D3.js bubble menu developed by Shipeng Sun (sunsp.gis@gmail.com), Institute of Environment, University of Minnesota, and University of Springfield, Illinois."); + + var t = svg.transition() + .duration(650); + + t.selectAll(".topBubble") + .attr("r", function(d) { + return (d.weight * 2.3) * oR; + }) + .attr("cx", function(d, i) { + return oR * (3 * (1 + i) - 1); + }) + .attr("cy", (h + oR) / 3); + + t.selectAll(".topBubbleText") + .attr("font-size", 30) + .attr("x", function(d, i) { + return oR * (3 * (1 + i) - 1); + }) + .attr("y", (h + oR) / 3); + + for (var k = 0; k < nTop; k++) { + t.selectAll(".childBubbleText" + k) + .attr("x", function(d, i) { + return (oR * (3 * (k + 1) - 1) + oR * 1.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("y", function(d, i) { + return ((h + oR) / 3 + oR * 1.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("font-size", 6) + .style("opacity", 0.5); + + t.selectAll(".childBubble" + k) + .attr("r", function(d) { + return oR / 3.0; + }) + .style("opacity", 0.5) + .attr("cx", function(d, i) { + return (oR * (3 * (k + 1) - 1) + oR * 1.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("cy", function(d, i) { + return ((h + oR) / 3 + oR * 1.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }); + } +} + +function activateBubble(d, i) { + // increase this bubble and decrease others + var t = svg.transition() + .duration(d3.event.altKey ? 7500 : 350); + + t.selectAll(".topBubble") + .attr("cx", function(d, ii) { + if (i == ii) { + // Nothing to change + return oR * (3 * (1 + ii) - 1) - 0.6 * oR * (ii - 1); + } else { + // Push away a little bit + if (ii < i) { + // left side + return oR * 0.6 * (3 * (1 + ii) - 1); + } else { + // right side + return oR * (nTop * 3 + 1) - oR * 0.6 * (3 * (nTop - ii) - 1); + } + } + }) + .attr("r", function(d, ii) { + if (i == ii) + return oR * 1.8; + else + return oR * 0.8; + }); + + t.selectAll(".topBubbleText") + .attr("x", function(d, ii) { + if (i == ii) { + // Nothing to change + return oR * (3 * (1 + ii) - 1) - 0.6 * oR * (ii - 1); + } else { + // Push away a little bit + if (ii < i) { + // left side + return oR * 0.6 * (3 * (1 + ii) - 1); + } else { + // right side + return oR * (nTop * 3 + 1) - oR * 0.6 * (3 * (nTop - ii) - 1); + } + } + }) + .attr("font-size", function(d, ii) { + if (i == ii) + return 30 * 1.5; + else + return 30 * 0.6; + }); + + var signSide = -1; + for (var k = 0; k < nTop; k++) { + signSide = 1; + if (k < nTop / 2) signSide = 1; + t.selectAll(".childBubbleText" + k) + .attr("x", function(d, i) { + return (oR * (3 * (k + 1) - 1) - 0.6 * oR * (k - 1) + signSide * oR * 2.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("y", function(d, i) { + return ((h + oR) / 3 + signSide * oR * 2.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("font-size", function() { + return (k == i) ? 12 : 6; + }) + .style("opacity", function() { + return (k == i) ? 1 : 0; + }); + + t.selectAll(".childBubble" + k) + .attr("cx", function(d, i) { + return (oR * (3 * (k + 1) - 1) - 0.6 * oR * (k - 1) + signSide * oR * 2.5 * Math.cos((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("cy", function(d, i) { + return ((h + oR) / 3 + signSide * oR * 2.5 * Math.sin((i - 1) * 45 / 180 * 3.1415926)); + }) + .attr("r", function() { + return (k == i) ? (oR * 0.55) : (oR / 3.0); + }) + .style("opacity", function() { + return (k == i) ? 1 : 0; + }); + } +} + +window.onresize = resetBubbles; diff --git a/client/modules/core/controllers/events.ctrl.js b/client/modules/core/controllers/events.ctrl.js index 1e0d868..8267829 100644 --- a/client/modules/core/controllers/events.ctrl.js +++ b/client/modules/core/controllers/events.ctrl.js @@ -23,6 +23,28 @@ function EventsCtrl($scope, PostsCluster, SocialMediaPost, Event, $window, autho console.info(`${$window.location.origin}/api/charts/twitter/user-network?eventid=${evnt.id}`); }; + $scope.topicSelected = function(topic) { + // already selected + if ($scope.selectedTopic && $scope.selectedTopic.id === topic.id) + return; + + $scope.selectedTopic = topic; + $scope.hashtags = $scope.selectedTopic.top_hashtags; + + visualizeTopic(topic); + }; + + function visualizeTopic(topic) { + $scope.imageUrls = topic.image_urls; + + sampleSocialMediaPosts('image', topic.post_ids, 500) + .then(posts => { + $scope.posts = _(posts).orderBy(p => p.screen_name.toLowerCase()).value(); + createPostsCharts(posts); + }) + .catch(console.error); + } + $scope.eventNamed = function(evnt) { Event.prototype$updateAttributes({ id: evnt.id, @@ -208,7 +230,7 @@ function EventsCtrl($scope, PostsCluster, SocialMediaPost, Event, $window, autho }, forPosts() { - getPosts(clusters) + getPostsByClusters(clusters) .then(posts => { $scope.posts = _(posts).orderBy(p => p.screen_name.toLowerCase()).value(); createPostsCharts(posts); @@ -229,7 +251,7 @@ function EventsCtrl($scope, PostsCluster, SocialMediaPost, Event, $window, autho return functions; } - function getPosts(clusters) { + function getPostsByClusters(clusters) { let similarPostIds = _(clusters).map('similar_post_ids') .flatten().compact().uniq().value(); diff --git a/client/modules/core/directives/event-graph-directive.js b/client/modules/core/directives/event-graph-directive.js index 460cde0..588eee6 100644 --- a/client/modules/core/directives/event-graph-directive.js +++ b/client/modules/core/directives/event-graph-directive.js @@ -15,7 +15,7 @@ function eventGraphDirective() { } } -function eventGraphController($scope, Event, EventNetwork) { +function eventGraphController($scope, Event, EventNetwork, Topic) { // models primarily controlled by directive $scope.events = null; $scope.selectedDates = [0,0]; @@ -38,6 +38,25 @@ function eventGraphController($scope, Event, EventNetwork) { decorateSelectedEvents(); }; + $scope.getTopicsInRange = function(start, end) { + // use cached value if none given + start = start || $scope.selectedDates[0]; + end = end || $scope.selectedDates[1]; + Topic.find({ + // TODO: + filter: { + // where: { + + // } + order: 'cnt_post_ids desc' + } + }) + .$promise + .then(topics => $scope.selectedTopics = topics) + ; + + }; + // issue 'side' queries, to decorate events with additional attributes. // an alternative to creating many one-off event-related network calls. function decorateSelectedEvents(callback) { @@ -221,6 +240,7 @@ function eventGraphController($scope, Event, EventNetwork) { // TODO: better way to get selected values from chart? $scope.selectedDates = [start, end]; $scope.getEventsInRange(start, end); + $scope.getTopicsInRange(start, end); } navChart.append('g') diff --git a/client/modules/core/views/demo/topics.html b/client/modules/core/views/demo/topics.html new file mode 100644 index 0000000..2bee7b8 --- /dev/null +++ b/client/modules/core/views/demo/topics.html @@ -0,0 +1,76 @@ + + + + + + + + +

Topics by Twitter Reply threads

+ +
+ + + ** "most discussed" threads, at top (by # tweets) +
+ +
+
+ +
+

+  
+ + + + + + diff --git a/client/modules/core/views/pages/events.jade b/client/modules/core/views/pages/events.jade index 03f895e..eb7f60a 100644 --- a/client/modules/core/views/pages/events.jade +++ b/client/modules/core/views/pages/events.jade @@ -1,140 +1,223 @@ spinner(show='showSpinner') //- boolean scope property to toggle spinner -h3 Events +h3 Events & Topics h5 drag mouse over an area to select + .row(style='padding-bottom:20px') .col-md-12 .nav-chart-container(style='width:95%; height:160px') event-graph -.row - .col-md-3(style='height:550px; overflow-y:auto' ng-show='selectedEvents') - | Filter (supports regex on: tags, text, author, mentions) - form(ng-submit='filterChanged()') - .form-group + +.row(style='padding-bottom:20px') + .col-md-12 + ul.nav.nav-tabs(role='tablist') + li(role='presentation'): a(data-target='#tc-events' role='tab' data-toggle='tab') Events + li.active(role='presentation'): a(data-target='#tc-topics' role='tab' data-toggle='tab') Topics + + .tab-content + #tc-topics.tab-pane.active(role='tabpanel') .row - .col-md-10 - input.form-control(ng-model='filterText') - .col-md-2 - button.btn.btn-info(type='submit' style='float:right') - span.glyphicon.glyphicon-search + .col-md-3(style='height:550px; overflow-y:auto' ng-show='selectedTopics') + + table.table.table-striped + caption Topics ({{selectedTopics.length}}) + thead + tr + th topic + th # posts + th avg. weight + tbody + tr(ng-repeat='topic in selectedTopics') + td(ng-click='topicSelected(topic)' ng-class='{highlight: (selectedTopic && selectedTopic.id === topic.id)}'). + {{topic.topic}} + td {{topic.cnt_post_ids}} + td {{topic.avg_weight | number:2}} + + .col-md-4(style='height:550px; overflow-y:auto' ng-show='selectedTopic') + .row(ng-show='hashtags | isDefined' style='width:100%; height:100px; overflow-y:auto; font-size:1.5em') + |# + div(ng-repeat='ht in hashtags' style='display:inline-block'). + {{ ht }}{{$last ? '' : ','}}  + .row(ng-show='imageUrls' style='width:100%; height:400px; overflow-y:auto') + image-grid(hover-dir='left') + + .col-md-5(style='height:550px; overflow-y:auto' ng-show='selectedTopic') + ul.nav.nav-tabs(role='tablist') + li.active(role='presentation'): a(data-target='#tc-post-types2' role='tab' data-toggle='tab') Post types + li(role='presentation'): a(data-target='#tc-posts2' role='tab' data-toggle='tab') Posts + li(role='presentation'): a(data-target='#tc-refs2' role='tab' data-toggle='tab') Most referenced - table.table.table-striped - caption Events - tbody - tr(ng-repeat='evt in selectedEvents') - td(ng-click='eventSelected(evt)' ng-class='{highlight: (selectedEvent && selectedEvent.id === evt.id)}') - .row - .cell - span(style="font-weight:bold" title="This event has hashtags" ng-show="evt.hashtags | isDefined") # - span(style="font-weight:bold") {{evt.hashtags | sampleWeightedPairs}} - .cell - span.glyphicon.glyphicon-picture(title="This event has images" ng-show="evt.image_urls | isDefined") - .cell - span.glyphicon.glyphicon-map-marker(title="This event has locations" ng-show="evt.location | isDefined") - .cell - span.glyphicon.glyphicon-user(title="This event has a user network" ng-show="evt.has_user_network") - .row - .cell - p(style='display:inline') {{evt.start_time_ms | date:'short'}} - {{evt.end_time_ms | date:'short'}} - input.form-control(style='display:inline;width:100px;margin-left:16px' ng-model='evt.name' ng-model-options='{ debounce: 1000 }' ng-change='eventNamed(evt)') - .row - .cell - label Interesting: - .cell - label.switch - input(type='checkbox' ng-model='evt.of_interest' ng-change='ofInterestChanged(evt)') - .slider.round + .tab-content + #tc-post-types2.tab-pane.active(role='tabpanel') + .col-md-6 + mark Posts ({{posts.length}}) + pie-chart(content='{{postTypeCounts}}' conf='{{::postTypeConf}}') + .col-md-6 + table.table.table-condensed + caption counts (sampled) + tbody + tr(ng-repeat='c in postTypeCounts') + td {{c.label}} + td {{c.value}} + table.table.table-condensed(style='margin-top:-8px') + caption aggregates + tbody + tr(ng-repeat='c in aggCounts') + td {{c.label}} + td {{c.value}} + #tc-posts2.tab-pane(role='tabpanel') + table.table.table-striped.table-condensed + caption Posts ({{posts.length}}) + tbody + tr(ng-repeat='post in posts') + td + a(ng-click='loadAuthorUrl(post)') + | {{post.screen_name}} + br + img(ng-src='{{post.author_image_url}}' style='border:0; width:48px; height:48px') + td(ng-class='{highlight: (selectedImageUrl && selectedImageUrl === post.primary_image_url)}') {{post.text}} + td: a(ng-href='{{post.post_url}}' target='_blank') visit + #tc-refs2.tab-pane(role='tabpanel') + table.table.table-condensed(style='margin-top:-8px') + caption Retweeted links + tbody + tr(ng-repeat='c in broadcastCounts') + td: a(ng-href='{{c.label | statusUrl}}' target='_blank') {{c.label}} + td {{c.value}} - .col-md-4(style='height:550px; overflow-y:hidden') - event-map(ng-show='mapPoints | isDefined' points='mapPoints') - iframe#map-frame(src='http://54.86.248.86:3005/index.html' style='width:100%; height:600px' scrolling='no' frameborder='0') - .col-md-5(ng-show='selectedEvent') - .row(ng-show='imageUrls' style='width:100%; height:120px; overflow-y:auto') - image-grid(hover-dir='left') - .row(ng-show='hashtags | isDefined' style='width:100%; height:100px; overflow-y:auto; font-size:1.5em') - |# - div(ng-repeat='ht in hashtags' style='display:inline-block') - translate(phrase='{{ht[0]}}') - | ({{ht[1]}}){{$last ? '' : ','}}  - .row(style='width:100%; height:280px; overflow-y:auto') - ul.nav.nav-tabs(role='tablist') - li.active(role='presentation'): a(data-target='#tc-post-types' role='tab' data-toggle='tab') Post types - li(role='presentation'): a(data-target='#tc-posts' role='tab' data-toggle='tab') Posts - li(role='presentation'): a(data-target='#tc-refs' role='tab' data-toggle='tab') Most referenced - li(role='presentation'): a(data-target='#tc-user-network' role='tab' data-toggle='tab') User network - .tab-content - #tc-post-types.tab-pane.active(role='tabpanel') - .col-md-6 - mark Posts ({{posts.length}}) - pie-chart(content='{{postTypeCounts}}' conf='{{::postTypeConf}}') - .col-md-6 - table.table.table-condensed - caption counts (sampled) + #tc-events.tab-pane(role='tabpanel') + + .row + .col-md-3(style='height:550px; overflow-y:auto' ng-show='selectedEvents') + | Filter (supports regex on: tags, text, author, mentions) + form(ng-submit='filterChanged()') + .form-group + .row + .col-md-10 + input.form-control(ng-model='filterText') + .col-md-2 + button.btn.btn-info(type='submit' style='float:right') + span.glyphicon.glyphicon-search + + table.table.table-striped + caption Events + tbody + tr(ng-repeat='evt in selectedEvents') + td(ng-click='eventSelected(evt)' ng-class='{highlight: (selectedEvent && selectedEvent.id === evt.id)}') + .row + .cell + span(style="font-weight:bold" title="This event has hashtags" ng-show="evt.hashtags | isDefined") # + span(style="font-weight:bold") {{evt.hashtags | sampleWeightedPairs}} + .cell + span.glyphicon.glyphicon-picture(title="This event has images" ng-show="evt.image_urls | isDefined") + .cell + span.glyphicon.glyphicon-map-marker(title="This event has locations" ng-show="evt.location | isDefined") + .cell + span.glyphicon.glyphicon-user(title="This event has a user network" ng-show="evt.has_user_network") + .row + .cell + p(style='display:inline') {{evt.start_time_ms | date:'short'}} - {{evt.end_time_ms | date:'short'}} + input.form-control(style='display:inline;width:100px;margin-left:16px' ng-model='evt.name' ng-model-options='{ debounce: 1000 }' ng-change='eventNamed(evt)') + .row + .cell + label Interesting: + .cell + label.switch + input(type='checkbox' ng-model='evt.of_interest' ng-change='ofInterestChanged(evt)') + .slider.round + + + .col-md-4(style='height:550px; overflow-y:hidden') + event-map(ng-show='mapPoints | isDefined' points='mapPoints') + iframe#map-frame(src='http://54.86.248.86:3005/index.html' style='width:100%; height:600px' scrolling='no' frameborder='0') + .col-md-5(ng-show='selectedEvent') + .row(ng-show='imageUrls' style='width:100%; height:120px; overflow-y:auto') + image-grid(hover-dir='left') + .row(ng-show='hashtags | isDefined' style='width:100%; height:100px; overflow-y:auto; font-size:1.5em') + |# + div(ng-repeat='ht in hashtags' style='display:inline-block') + translate(phrase='{{ht[0]}}') + | ({{ht[1]}}){{$last ? '' : ','}}  + .row(style='width:100%; height:280px; overflow-y:auto') + ul.nav.nav-tabs(role='tablist') + li.active(role='presentation'): a(data-target='#tc-post-types' role='tab' data-toggle='tab') Post types + li(role='presentation'): a(data-target='#tc-posts' role='tab' data-toggle='tab') Posts + li(role='presentation'): a(data-target='#tc-refs' role='tab' data-toggle='tab') Most referenced + li(role='presentation'): a(data-target='#tc-user-network' role='tab' data-toggle='tab') User network + + .tab-content + #tc-post-types.tab-pane.active(role='tabpanel') + .col-md-6 + mark Posts ({{posts.length}}) + pie-chart(content='{{postTypeCounts}}' conf='{{::postTypeConf}}') + .col-md-6 + table.table.table-condensed + caption counts (sampled) + tbody + tr(ng-repeat='c in postTypeCounts') + td {{c.label}} + td {{c.value}} + table.table.table-condensed(style='margin-top:-8px') + caption aggregates + tbody + tr(ng-repeat='c in aggCounts') + td {{c.label}} + td {{c.value}} + #tc-posts.tab-pane(role='tabpanel') + table.table.table-striped.table-condensed + caption Posts ({{posts.length}}) + tbody + tr(ng-repeat='post in posts') + td + a(ng-click='loadAuthorUrl(post)') + | {{post.screen_name}} + br + img(ng-src='{{post.author_image_url}}' style='border:0; width:48px; height:48px') + td(ng-class='{highlight: (selectedImageUrl && selectedImageUrl === post.primary_image_url)}') {{post.text}} + td: a(ng-href='{{post.post_url}}' target='_blank') visit + #tc-refs.tab-pane(role='tabpanel') + table.table.table-condensed(style='margin-top:-8px') + caption Retweeted links + tbody + tr(ng-repeat='c in broadcastCounts') + td: a(ng-href='{{c.label | statusUrl}}' target='_blank') {{c.label}} + td {{c.value}} + #tc-user-network.tab-pane(role='tabpanel') + .chart-container(style='width:100%; height:300px') + user-network-graph + .row(ng-show='selectedEvent') + .col-md-4(style='height:400px; overflow-y:auto') + table.table.table-striped + caption Keywords + thead + tr + th word + th count + tbody + tr(ng-repeat='keyword in keywords') + td + translate(phrase='{{keyword[0]}}') + td {{keyword[1]}} + .col-md-4(style='height:400px; overflow-y:auto') + table.table.table-striped + caption Locations + thead + tr + th loc + th weight tbody - tr(ng-repeat='c in postTypeCounts') - td {{c.label}} - td {{c.value}} - table.table.table-condensed(style='margin-top:-8px') - caption aggregates + tr(ng-repeat='loc in locations') + td: translate(phrase='{{loc.label}}') + td {{loc.weight | number:1}} + .col-md-4(ng-show='authorPosts' style='height:400px; overflow-y:auto') + table.table.table-striped + caption + a(ng-click='loadAuthorUrl(authorPosts[0])') + img(ng-src='{{authorPosts[0].author_image_url}}' style='border:0; width:48px; height:48px') + | All {{authorPosts[0].screen_name}} Posts ({{authorPosts.length}}) tbody - tr(ng-repeat='c in aggCounts') - td {{c.label}} - td {{c.value}} - #tc-posts.tab-pane(role='tabpanel') - table.table.table-striped.table-condensed - caption Posts ({{posts.length}}) - tbody - tr(ng-repeat='post in posts') - td - a(ng-click='loadAuthorUrl(post)') - | {{post.screen_name}} - br - img(ng-src='{{post.author_image_url}}' style='border:0; width:48px; height:48px') - td(ng-class='{highlight: (selectedImageUrl && selectedImageUrl === post.primary_image_url)}') {{post.text}} - td: a(ng-href='{{post.post_url}}' target='_blank') visit - #tc-refs.tab-pane(role='tabpanel') - table.table.table-condensed(style='margin-top:-8px') - caption Retweeted links - tbody - tr(ng-repeat='c in broadcastCounts') - td: a(ng-href='{{c.label | statusUrl}}' target='_blank') {{c.label}} - td {{c.value}} - #tc-user-network.tab-pane(role='tabpanel') - .chart-container(style='width:100%; height:300px') - user-network-graph -.row(ng-show='selectedEvent') - .col-md-4(style='height:400px; overflow-y:auto') - table.table.table-striped - caption Keywords - thead - tr - th word - th count - tbody - tr(ng-repeat='keyword in keywords') - td - translate(phrase='{{keyword[0]}}') - td {{keyword[1]}} - .col-md-4(style='height:400px; overflow-y:auto') - table.table.table-striped - caption Locations - thead - tr - th loc - th weight - tbody - tr(ng-repeat='loc in locations') - td: translate(phrase='{{loc.label}}') - td {{loc.weight | number:1}} - .col-md-4(ng-show='authorPosts' style='height:400px; overflow-y:auto') - table.table.table-striped - caption - a(ng-click='loadAuthorUrl(authorPosts[0])') - img(ng-src='{{authorPosts[0].author_image_url}}' style='border:0; width:48px; height:48px') - | All {{authorPosts[0].screen_name}} Posts ({{authorPosts.length}}) - tbody - tr(ng-repeat='post in authorPosts') - td: a(ng-href='{{post.post_url}}' target='_blank') visit - td {{post.text}} - td {{post.timestamp_ms | date:'short'}} + tr(ng-repeat='post in authorPosts') + td: a(ng-href='{{post.post_url}}' target='_blank') visit + td {{post.text}} + td {{post.timestamp_ms | date:'short'}} diff --git a/common/models/job.js b/common/models/job.js index 71f6b5c..63c7d07 100644 --- a/common/models/job.js +++ b/common/models/job.js @@ -1,7 +1,8 @@ 'use strict'; const redis = require('../../lib/redis'), - _ = require('lodash'); + _ = require('lodash'), + idGen = require('../../server/util/id-generator'); module.exports = function(Job) { @@ -32,4 +33,37 @@ module.exports = function(Job) { return redis.hgetall(jobId) .then(job => _.pick(job, ['state', 'data', 'error'])); }; + + Job.remoteMethod( + 'submit', + { + description: 'Submit a background job for processing', + accepts: { + arg: 'args', + type: 'object', + description: 'object with properties "job_type" + misc. job attrs', + required: true, + http: { source: 'body' } + }, + returns: {type: 'object', root: true}, + http: {path: '/submit', verb: 'post'} + } + ); + + // submit a job for processing. + // args: redis job attrs. + // job_type is a redis list being watched. + Job.submit = function(args, cb) { + const key = idGen.randomish(), + listName = args.job_type; + + delete args.job_type; + + // services expect 'new' state. + args.state = 'new'; + + redis.enqueue(key, args, listName) + .then(() => cb(null, { job_id: key })) + .catch(err => cb(err)); + }; }; diff --git a/common/models/topic.js b/common/models/topic.js new file mode 100644 index 0000000..b14d18c --- /dev/null +++ b/common/models/topic.js @@ -0,0 +1,5 @@ +'use strict'; + +module.exports = function(Topic) { + +}; diff --git a/common/models/topic.json b/common/models/topic.json new file mode 100644 index 0000000..cbabef1 --- /dev/null +++ b/common/models/topic.json @@ -0,0 +1,22 @@ +{ + "name": "topic", + "description": "Learned topics from smposts", + "base": "PersistedModel", + "idInjection": true, + "indexes": { + }, + "options": { + "validateUpsert": true + }, + "mixins": { + "Timestamp": {}, + "DestroyAll": {} + }, + "properties": { + + }, + "validations": [], + "relations": {}, + "acls": [], + "methods": {} +} diff --git a/deploy/compose/services.yml b/deploy/compose/services.yml index 8fe6bbf..2854583 100644 --- a/deploy/compose/services.yml +++ b/deploy/compose/services.yml @@ -116,6 +116,28 @@ services: ports: - "3005:3000" + # event-classifier: + # image: sotera/event-classifier:3 + # hostname: event-classifier + # environment: + # - SPARK_EXECUTOR_MEMORY=10g + # - SPARK_DRIVER_MEMORY=10g + # depends_on: + # - redis + # - mongo + + # silk-specter: + # image: sotera/silk-specter:8 + # hostname: silk-specter + # environment: + # - SPARK_EXECUTOR_MEMORY=30g + # - SPARK_DRIVER_MEMORY=30g + # - DB_HOST=mongo + # - DB_PORT=27017 + # depends_on: + # - redis + # - mongo + dr-manhattan: extends: file: common.yml diff --git a/deploy/docker/pyspark-mongo-jupyter/docker-compose.notebook.yml b/deploy/docker/pyspark-mongo-jupyter/docker-compose.notebook.yml index 54da883..6e858dc 100644 --- a/deploy/docker/pyspark-mongo-jupyter/docker-compose.notebook.yml +++ b/deploy/docker/pyspark-mongo-jupyter/docker-compose.notebook.yml @@ -1,11 +1,14 @@ version: '2.1' # runs jupyter on localhost:8888 by default. -# runs in either spark local or standalone cluster mode. +# can run in either spark local or standalone cluster mode. # i.e. # SparkSession.builder.master("spark://master:7077").getOrCreate() # SparkSession.builder.master("local[*]").getOrCreate() +# in either mode, localhost:4040 is the Spark app UI. +# in cluster mode, localhost:9090 is the master UI. + networks: watchman: # connect to existing watchman network @@ -18,8 +21,9 @@ services: - watchman image: sotera/pyspark-mongo-jupyter:4 ports: + - "4040:4040" + - "8080:8080" - "8888:8888" - - "6006:6006" environment: GRANT_SUDO: "yes" extends: @@ -46,10 +50,10 @@ services: - 7077 - 6066 ports: - - "4040:4040" + - "5050:4040" - "6066:6066" - "7077:7077" - - "8080:8080" + - "9090:8080" extends: file: common.notebook.yml service: pyspark diff --git a/script/deploy/compose b/script/deploy/compose index 6c004c4..229fed3 100755 --- a/script/deploy/compose +++ b/script/deploy/compose @@ -27,9 +27,9 @@ case "$1" in ;; up) - docker-compose $compose_files up -d "${@:2}" - if [ "$2" == "deploy" ]; then + # make sure svcs are up + docker-compose $compose_files up -d curr_branch=`git rev-parse --abbrev-ref HEAD` if [ "$3" == "local" ]; then echo $"⇨ Deploying from LOCAL branch: $curr_branch" @@ -42,6 +42,9 @@ case "$1" in script/deploy/slc-deploy.js git checkout $curr_branch fi + else + # pass-thru + docker-compose $compose_files up -d "${@:2}" fi ;; diff --git a/script/docker/get-tags.sh b/script/docker/get-tags.sh index 8a79d18..09a8e92 100755 --- a/script/docker/get-tags.sh +++ b/script/docker/get-tags.sh @@ -8,7 +8,7 @@ if ! [ -e "$jq" ]; then fi for name in 'caffe_redis_client' 'comedian' 'dr-manhattan' \ -'feature-sim' 'image-fetcher' 'ozymandias' 'rorschach' +'feature-sim' 'image-fetcher' 'ozymandias' 'rorschach' 'silk-specter' do curl -s https://registry.hub.docker.com/v2/repositories/sotera/$name/tags/ | jq --arg name $name '."results"[0] | {name: [$name]} + {last_tag: .name, updated: .last_updated}' diff --git a/script/docker/start-pyspark.sh b/script/docker/start-pyspark.sh index bab3518..aa288ed 100755 --- a/script/docker/start-pyspark.sh +++ b/script/docker/start-pyspark.sh @@ -6,7 +6,7 @@ svc="pyspark-nb" # if "cluster", start master, worker containers. if [ "$1" == "cluster" ]; then svc="" - echo "** You can run spark in local or cluster mode (spark://master:7077) **" + echo "** You can run spark in local (local[*]) or cluster mode (spark://master:7077) **" else echo "** You can run spark in local mode only **" fi diff --git a/server/boot/ui-routes.js b/server/boot/ui-routes.js index 602ac39..5651d41 100644 --- a/server/boot/ui-routes.js +++ b/server/boot/ui-routes.js @@ -20,6 +20,13 @@ module.exports = function(server) { }); // demo pages + router.get('/demo/topics*', function(req, res, next) { + const filePath = path.resolve( + path.join(server.get('views'), viewsPath, 'demo/topics.html') + ); + res.sendFile(filePath); + }); + router.get('/demo/*?', function(req, res, next) { res.render(path.join(viewsPath, 'demo', req.params[0])); }); diff --git a/server/model-config.json b/server/model-config.json index 0d45320..2aca896 100644 --- a/server/model-config.json +++ b/server/model-config.json @@ -111,5 +111,9 @@ "eventNetwork": { "dataSource": "db", "public": true + }, + "topic": { + "dataSource": "db", + "public": true } } diff --git a/server/util/id-generator.js b/server/util/id-generator.js index 04ee225..5d5f1aa 100644 --- a/server/util/id-generator.js +++ b/server/util/id-generator.js @@ -1,5 +1,7 @@ 'use strict'; +// def: generate id's for jobs, etc. + module.exports = { // when random isn't uber important randomish(min, max) { diff --git a/services/event-classifier/classifier.py b/services/event-classifier/classifier.py new file mode 100644 index 0000000..d0e4c9a --- /dev/null +++ b/services/event-classifier/classifier.py @@ -0,0 +1,141 @@ +from sklearn.externals import joblib +import pyspark.sql.functions as F +from pyspark.sql.types import * +from os import getenv +import sys, os +sys.path.append(os.path.join(os.path.dirname(__file__), '../util')) +from mongo_spark_client import Client as SparkClient + +IMG_MIN = 3 # min image count + +# model: serialized obj that responds to predict() +# TODO: broadcast to cluster? +model = joblib.load(getenv('MODEL_PATH', '/usr/src/app/bagging_clf.pkl')) + +class Classifier(object): + def __init__(self): + self.spark = SparkClient(master='localhost[*]') + + # event_ids: comma-separated ids, like "123,456,789" + def run(self, event_ids): + event_ids = event_ids.replace(' ', '').split(',') + + self.spark.collection = 'event' + + orig_events = self.spark.read()\ + .where(F.col('_id').isin(event_ids)) + + events = orig_events\ + .select('_id', 'name', 'of_interest', 'location', + 'keywords', 'hashtags', 'image_urls', + F.explode('cluster_ids').alias('cluster_id')) + + #### F.size() triggers an action (vs. transform) + #### so lets filter it by cluster_ids right away. + cluster_ids = list(events.toPandas()['cluster_id'].values) + clusters = self._readPostsClusters(cluster_ids) + + events_clusters = events.join(clusters, + events['cluster_id'] == clusters['_id']) + + events_posts_cnt = events_clusters\ + .groupby(events['_id'])\ + .agg(F.sum('posts_cnt').alias('all_posts_cnt')) + + agg_events1 = events_clusters\ + .groupby(events['_id'], + 'of_interest', 'data_type', 'location', 'keywords', + 'hashtags', 'image_urls')\ + .agg(F.sum('posts_cnt').alias('dt_posts_cnt')) + + agg_events2 = agg_events1\ + .join(events_posts_cnt, agg_events1['_id'] == events_posts_cnt['_id'])\ + .drop(events_posts_cnt['_id'])\ + .withColumn('text_wt', + F.when(agg_events1['data_type'] == 'text', + F.col('dt_posts_cnt')/F.col('all_posts_cnt')).otherwise(0))\ + .withColumn('image_wt', + F.when(agg_events1['data_type'] == 'image', + F.col('dt_posts_cnt')/F.col('all_posts_cnt')).otherwise(0))\ + .withColumn('hashtag_wt', + F.when(agg_events1['data_type'] == 'hashtag', + F.col('dt_posts_cnt')/F.col('all_posts_cnt')).otherwise(0)) + + agg_events3 = agg_events2\ + .groupby('_id', 'of_interest', 'location', 'keywords', 'hashtags', + 'image_urls')\ + .agg(F.sum('all_posts_cnt'), F.sum('text_wt'), F.sum('image_wt'), + F.sum('hashtag_wt')) + + df_classify = agg_events3\ + .withColumn('tmp_classify', ( + u_classify( + F.col('hashtags'), F.col('location'), F.col('sum(hashtag_wt)'), + F.col('sum(text_wt)'), F.col('sum(image_wt)'))).cast('boolean') + )\ + .drop('of_interest')\ + .drop('sum(text_wt)')\ + .drop('sum(image_wt)')\ + .drop('sum(hashtag_wt)')\ + .drop('sum(all_posts_cnt)')\ + .drop('location')\ + .drop('keywords')\ + .drop('hashtags')\ + .drop('image_urls')\ + .withColumnRenamed('tmp_classify', 'of_interest') + + df_final = df_classify\ + .join(orig_events, df_classify['_id'] == orig_events['_id'])\ + .drop(df_classify['_id'])\ + .drop(orig_events['of_interest']) + + df_final.show() + + self.spark.write(df_final) + + def _readPostsClusters(self, cluster_ids): + # change collection temporarily for this query + self.spark.collection = 'postsCluster' + + #### F.size() triggers an action (vs. transform) + #### so lets filter it by cluster_ids right away. + clusters = self.spark.read()\ + .select('_id', 'similar_ids', 'data_type', + F.size('similar_ids').alias('posts_cnt'))\ + .where(F.col('_id').isin(cluster_ids)) + + ### Reset to event + self.spark.collection = 'event' + + return clusters + +def classify(hashtags, locations, hashtag_wt, text_wt, image_wt): + X = [[hashtag_wt, text_wt, image_wt, score_ht(hashtags), + score_loc(locations)]] + pred = model.predict(X) + return str(pred[0]) + +u_classify = F.udf(classify) + +def score_loc(locations): + if not (locations and len(locations)): return 0 + if not locations[0]: return 0 + top_wt = float(locations[0].weight) + tot = sum(map(lambda loc: float(loc.weight), locations)) + if not tot: return 0 + return top_wt/tot + +def score_ht(hashtags): + if not (hashtags and len(hashtags)): return 0 + top_wt = int(hashtags[0][1]) + tot = sum(map(lambda k: int(k[1]), hashtags)) + if not tot: return 0 + return top_wt/tot + +def score_img(urls): + if not (urls and len(urls)): return 0 + if len(urls) >= IMG_MIN: + return 1 + else: + return 0 + diff --git a/services/event-classifier/main.py b/services/event-classifier/main.py new file mode 100644 index 0000000..f4626be --- /dev/null +++ b/services/event-classifier/main.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + +import sys, os +sys.path.append(os.path.join(os.path.dirname(__file__), '../util')) +from redis_dispatcher import Dispatcher +from classifier import Classifier + +def set_err(job, msg): + job['state'] = 'error' + job['data'] = [] + job['error'] = msg + +def err_check(job): + required = {'event_ids'} + if not required.issubset(job): + set_err(job, 'Missing some required fields {}'.format(required)) + +def process_message(key, job): + err_check(job) + if job['state'] == 'error': + return + + # more stable if new instance for each job. + # TODO: singleton instead + clf = Classifier() + + clf.run(job['event_ids']) + + # nothing to save + job['data'] = [] + job['state'] = 'processed' + +dispatcher = Dispatcher(redis_host='redis', + process_func=process_message, + queues=['genie:event_classifier']) +dispatcher.start() diff --git a/services/event-classifier/requirements.txt b/services/event-classifier/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/services/silk-specter/courier.py b/services/silk-specter/courier.py new file mode 100644 index 0000000..7591412 --- /dev/null +++ b/services/silk-specter/courier.py @@ -0,0 +1,36 @@ +# def: send data to other systems. + +from kafka import KafkaProducer +from kafka.errors import KafkaError, KafkaTimeoutError +import traceback, json + +# re-use producer instance. +producer = None + +def deliver(topic, kafka_url='print', kafka_topic='print'): + if kafka_url == 'print': + print('kafka_url:', kafka_url) + print(topic) + return + + global producer + producer = producer or KafkaProducer(bootstrap_servers=kafka_url, + value_serializer=lambda v: json.dumps(v).encode('utf-8')) + + print('to kafka:', topic) + try: + state = producer.send(kafka_topic, to_qcr_format(topic)) + record_metadata = state.get(timeout=10) + print(record_metadata.topic) + print(record_metadata.partition) + print(record_metadata.offset) + except KafkaError as err: + traceback.print_exc() + except KafkaTimeoutError as err: + traceback.print_exc() + + +def to_qcr_format(topic): + topic.pop('created', None) # datetime not serializable + topic.pop('post_ids', None) + return topic diff --git a/services/silk-specter/fast_text_modeler.py b/services/silk-specter/fast_text_modeler.py new file mode 100644 index 0000000..e18516f --- /dev/null +++ b/services/silk-specter/fast_text_modeler.py @@ -0,0 +1,256 @@ +import pyspark.sql.functions as F +from pyspark.sql.types import * +import sys, os, json +from datetime import datetime +sys.path.append(os.path.join(os.path.dirname(__file__), '../util')) +from mongo_spark_client import Client as SparkClient +from tokenizer import pres_tokenize +import date_utils as dtu +from courier import deliver +import fasttext + +# common twitter hashtags. +COMMON_TAGS = ['breaking', 'news', 'breakingnews', 'foxnews', 'job', 'jobs', + 'hiring', 'careerarc'] + +class Model(object): + def __init__(self): + db_host = os.getenv('DB_HOST', 'mongo') + db_port = os.getenv('DB_PORT', 27017) + uri = 'mongodb://{}:{}'.format(db_host, db_port) + print('db conf:', uri) + self.spark = SparkClient(uri=uri) + self.counts = {} # for reporting + + # times in ms. + def train(self, start_time=0, end_time=0): + print('train range: {} to {}'.format(dtu.dt_from_ms(start_time), dtu.dt_from_ms(end_time))) + df = self.query_labeled_posts(start_time, end_time) + + rdd_hash = df.rdd + + # flatten, count, chop the long tail, sort + hashtags = rdd_hash\ + .flatMap(lambda x: x.hashtags)\ + .map(lambda x: (x.lower(), 1))\ + .reduceByKey(lambda x, y: x+y)\ + .filter(lambda x: x[1] > 2)\ + .sortBy(lambda x: x[1], False)\ + .collect() + + top_hashtags = hashtags[: int(len(hashtags) * .01)] + print('top x% hashtags', top_hashtags) + labels = list(map(lambda x: x[0], top_hashtags)) + self.labels = labels + bc_labels = self.spark.sparkContext.broadcast(labels) + + # keep tweets with a top hashtag, and format data for training: + # list of pairs: ( [label1, label2, ...], text ) + all_hash = rdd_hash\ + .filter(lambda x: len(set(map(lambda x: x.lower(), x.hashtags)) & set(bc_labels.value)))\ + .map(lambda x: (list(set(map(lambda x: x.lower(), x.hashtags)) & set(bc_labels.value)), x.text)) + + tweets = all_hash.collect() + + train_file, test_file = '/tmp/tweet_data_train.txt', '/tmp/tweet_data_test.txt' + + # train-test split, multiple labels per line. + fo, fo2 = open(train_file, 'w'), open(test_file, 'w') + i = 0 + for tweet in tweets: + cleaned = clean_text(tweet[1]) + i+=1 + if i%10==0: # to test + for htag in tweet[0]: + _=fo2.write("__label__{} ".format(htag)) + _=fo2.write("{}\n".format(cleaned)) + else: # to train + for htag in tweet[0]: + _=fo.write("__label__{} ".format(htag)) + _=fo.write("{}\n".format(cleaned)) + + fo.close() + fo2.close() + + # epoch improves precision at least on smallish sets. make it a variable? + self.classifier = fasttext.supervised(train_file, '/tmp/model', epoch=35) + + self.analyze_model(test_file) + + return self.classifier + + # times in ms. + def predict(self, start_time=0, end_time=0, kafka_url='print', kafka_topic='print'): + print('predict range: {} to {}'.format(dtu.dt_from_ms(start_time), dtu.dt_from_ms(end_time))) + df_posts = self.query_unlabeled_posts(start_time, end_time)\ + .withColumn('fasttext_in', u_clean_text(F.col('text'))) + + def ft_model(text): + try: + pred = self.classifier.predict_proba([text])[0][0] + topics = dict( + topic=pred[0], + weight=pred[1] + ) + return json.dumps(topics) + except: + return '' + + # fasttext clf doesn't play nicely with spark so collect erthing + # and apply in pandas. + df_ft = df_posts.toPandas() + df_ft['ft_topics'] = df_ft['fasttext_in'].apply(ft_model) + + df_ft2 = self.spark.spark.createDataFrame(df_ft) + df_ft2 = df_ft2.select('*', + F.json_tuple(df_ft2.ft_topics, 'topic', 'weight').alias('topic', 'weight') + ) + + # rm low probab predictions. + df_ft2 = df_ft2.where(df_ft2.weight > 0.6) + + df_ft3 = df_ft2.select('*', F.explode('campaigns').alias('camp_id')) + + df_topics=df_ft3\ + .groupby('topic', 'camp_id')\ + .agg( + F.collect_list('post_id').alias('post_ids'), + F.count('post_id').alias('cnt_post_ids'), + F.collect_list('hashtags').alias('all_hashtags'), + F.mean('weight').alias('avg_weight') + )\ + .sort('cnt_post_ids', ascending=False) + + # def add_hashtags(_): + # return self.labels + + # u_add_hashtags = F.udf(add_hashtags, ArrayType(StringType())) + + df_topics = df_topics\ + .withColumn('hashtags', u_flatten('all_hashtags'))\ + .drop('all_hashtags') + # .withColumn('top_hashtags', u_add_hashtags('topic')) # hack to add literal array for each row + + df_topics = df_topics\ + .withColumn('_post_ids', u_trunc_array('post_ids')) + + df_topics = df_topics\ + .drop('post_ids')\ + .withColumnRenamed('_post_ids', 'post_ids') + + df_topics = df_topics.select('*', + F.lit(datetime.now()).alias('created'), + F.lit(start_time).alias('start_time'), + F.lit(end_time).alias('end_time') + ) + + self.counts['topics*campaigns'] = df_topics.count() + + # toLocalIterator has bug: https://issues.apache.org/jira/browse/SPARK-18281 + # topics_iter = df_topics.toLocalIterator() + topics = list(map(lambda s: json.loads(s), df_topics.toJSON().collect())) + + # for row in topics_iter: + for row in topics: + deliver(row, kafka_url, kafka_topic) + + self.save(df_topics) + + self.report_stats() + + self.spark.stop() + + def analyze_model(self, test_file): + result = self.classifier.test(test_file) + print('########################################') + print('############ MODEL ANALYSIS ############') + print('P@1:', result.precision) + print('R@1:', result.recall) + print('# of examples:', result.nexamples) + print('# of labels:', len(self.labels)) + print('dataset:', self.counts) + print('########################################') + + def report_stats(self): + print('########################################') + print('################ STATS #################') + print('dataset:', self.counts) + print('########################################') + + def save(self, df): + self.spark.collection = 'topic' + self.spark.write(df) + + # posts used to train. + # times in ms. + def query_labeled_posts(self, start_time=0, end_time=0): + df = self.query_posts(start_time, end_time) + + df_retweets = df\ + .where('broadcast_post_id is not null') + + df_no_retweets = df\ + .where('broadcast_post_id == "null" or broadcast_post_id is null') + + # use 1 retweet as stand-in for original tweet + df_retweets = df_retweets.dropDuplicates(['broadcast_post_id']) + + df_deduped = df_retweets.union(df_no_retweets) + + self.counts['labeled posts'] = df_deduped.count() + + return df_deduped + + # posts used in predictions. + # times in ms. + def query_unlabeled_posts(self, start_time=0, end_time=0): + df = self.query_posts(start_time, end_time) + + self.counts['unlabeled posts'] = df.count() + + return df + + # base posts query. rm common tags. + # N.B. featurizer type doesn't matter, just get 1. + # times in ms. + def query_posts(self, start_time=0, end_time=0): + self.spark.collection = 'socialMediaPost' + df = self.spark.read()\ + .select(['_id', 'text', 'featurizer', 'broadcast_post_id', 'campaigns', + 'hashtags', 'lang', 'post_id', 'timestamp_ms'])\ + .where('lang = "en"')\ + .where('featurizer == "hashtag"')\ + .where(u_no_common_tags('hashtags')) + + if start_time: + df = df.where(df.timestamp_ms >= start_time)\ + .where(df.timestamp_ms < end_time)\ + .cache() + else: + df = df.cache() + + self.counts['en posts w/o common tags'] = df.count() + + return df + + +def clean_text(txt): + return ' '.join(pres_tokenize(txt, 'en')) + +u_clean_text = F.udf(clean_text, StringType()) + +def flatten(l): + flat_list = [item for sublist in l for item in sublist] + return list(set(flat_list)) + +u_flatten = F.udf(flatten, ArrayType(StringType())) + +def trunc_array(arr): + return arr[:1000] + +u_trunc_array = F.udf(trunc_array, ArrayType(StringType())) + +def no_common_tags(tags): + return len(set(map(lambda x: x.lower(), tags)) & set(COMMON_TAGS)) == 0 + +u_no_common_tags = F.udf(no_common_tags, BooleanType()) diff --git a/services/silk-specter/fasttext-pmi.ipynb.keep b/services/silk-specter/fasttext-pmi.ipynb.keep new file mode 100644 index 0000000..d9b1da9 --- /dev/null +++ b/services/silk-specter/fasttext-pmi.ipynb.keep @@ -0,0 +1,3556 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``` $> script/docker/start-pyspark.sh cluster```" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting fasttext\n", + " Downloading fasttext-0.8.3.tar.gz (73kB)\n", + "\u001b[K 100% |████████████████████████████████| 81kB 2.7MB/s ta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: numpy>=1 in /opt/conda/lib/python3.5/site-packages (from fasttext)\n", + "Requirement already satisfied: future in /opt/conda/lib/python3.5/site-packages (from fasttext)\n", + "Building wheels for collected packages: fasttext\n", + " Running setup.py bdist_wheel for fasttext ... \u001b[?25ldone\n", + "\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/55/0a/95/e23f773666d3487ee7456b220f7e8d37e99b74833b20dd06a0\n", + "Successfully built fasttext\n", + "Installing collected packages: fasttext\n", + "Successfully installed fasttext-0.8.3\n" + ] + } + ], + "source": [ + "!pip install fasttext" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pyspark.sql.functions as F\n", + "from pyspark.sql.types import *\n", + "import sys, os, json\n", + "from datetime import datetime\n", + "# sys.path.append(os.path.join(os.path.dirname(__file__), '../util'))\n", + "from mongo_spark_client import Client as SparkClient\n", + "from tokenizer import pres_tokenize\n", + "import fasttext\n", + "import numpy as np\n", + "from itertools import combinations\n", + "from nb_utils import pp\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "db conf: mongodb://mongo:27017\n" + ] + } + ], + "source": [ + "db_host = os.getenv('DB_HOST', 'mongo')\n", + "db_port = os.getenv('DB_PORT', 27017)\n", + "uri = 'mongodb://{}:{}'.format(db_host, db_port)\n", + "print('db conf:', uri)\n", + "spark_client = SparkClient(uri=uri, master='spark://master:7077')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-use methods from silk-specter" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def query_labeled_posts(start_time=1, end_time=1):\n", + " df = query_posts(start_time, end_time)\n", + "\n", + " df_retweets = df\\\n", + " .where('broadcast_post_id is not null')\n", + "\n", + " df_no_retweets = df\\\n", + " .where('broadcast_post_id == \"null\" or broadcast_post_id is null')\n", + "\n", + " # use 1 retweet as stand-in for original tweet\n", + " df_retweets = df_retweets.dropDuplicates(['broadcast_post_id'])\n", + "\n", + " df_deduped = df_retweets.union(df_no_retweets)\n", + "\n", + "\n", + " return df_deduped\n", + "\n", + "# base posts query. rm common tags.\n", + "# N.B. featurizer type doesn't matter, just get 1.\n", + "def query_posts(self, start_time=1, end_time=1):\n", + " spark_client.collection = 'socialMediaPost'\n", + " df = spark_client.read()\\\n", + " .select(['_id', 'text', 'featurizer', 'broadcast_post_id', 'campaigns',\n", + " 'hashtags', 'lang', 'post_id', 'timestamp_ms'])\\\n", + " .where('lang = \"en\"')\\\n", + " .where('featurizer == \"hashtag\"')\\\n", + " .where(u_no_common_tags('hashtags'))\\\n", + " .cache()\n", + "\n", + " return df\n", + "\n", + "def no_common_tags(tags):\n", + " return len(set(map(lambda x: x.lower(), tags)) & set(COMMON_TAGS)) == 0\n", + "\n", + "u_no_common_tags = F.udf(no_common_tags, BooleanType())\n", + "\n", + "# common twitter hashtags.\n", + "COMMON_TAGS = ['breaking', 'news', 'breakingnews', 'foxnews', 'job', 'jobs',\n", + " 'hiring', 'careerarc']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StructType(List(StructField(_id,StructType(List(StructField(oid,StringType,true))),true),StructField(text,StringType,true),StructField(featurizer,StringType,true),StructField(broadcast_post_id,StringType,true),StructField(campaigns,ArrayType(StringType,true),true),StructField(hashtags,ArrayType(StringType,true),true),StructField(lang,StringType,true),StructField(post_id,StringType,true),StructField(timestamp_ms,DoubleType,true)))\n", + "count: 6337\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idtextfeaturizerbroadcast_post_idcampaignshashtagslangpost_idtimestamp_ms
0(597f78867cc64815add20b08,)RT @NickMosky: This makes more sense to me than planking https://t.co/MG3Q8xMp7E The new fad is to take selfies in front of Trump 🖕properties🖕.\\n\\n#RESIST #FillibusterGorsuch #TrumpRussia #billions… https://t.co/tdVsesxK5khashtag849041445126361090[600, 600, 600][RESIST, FillibusterGorsuch, TrumpRussia, billions]en8920914521965199361.501526e+12
1(597f76f47cc64815add1a67e,)RT @Homerx59: @TweetiepieHolly Me too... he could very well be worse. His #religiouszealot attitude is real where trump only beli…hashtag873060626406322176[300, 100, 200][religiouszealot]en8920897659981168641.501526e+12
2(597f9e5f7cc64815add29449,)RT @RickJoh05200551: Is Trump just another mob boss? ...a hardened criminal? https://t.co/M6zeTHUAa5 #SkinnyRepeal #Resist #TrumpRussia #S…hashtag890387033146961920[600, 400, 100][SkinnyRepeal, Resist, TrumpRussia]en8921320863998566431.501536e+12
3(597f785b7cc64815add2006d,)RT @YishaiFleisher: Join me @YishaiFleisher for the annual 9th of Av march around the walls of the Old City of #Jerusalem https://t.co/JRxv…hashtag890585584993001472[100, 100, 200][Jerusalem]en8920912693669109761.501526e+12
4(597f66eb7cc64815add0addd,)RT @PRETTYMUCH: celebrating one week of #WouldYouMind by shooting a q&amp;a video! what do you guys want to know? use #ASKPRETTYMUCH https://t.…hashtag891043343479652353[300, 400, 300][WouldYouMind, ASKPRETTYMUCH]en8920725466145955841.501522e+12
5(597f9e137cc64815add27e38,)RT @mcspocky: Dear tRump cultists…\\n#Resist #Resistance #TheResistance #Indivisible\\n#SCROTUS \\n#MAGA (by Jailing tRump) https://t.co/8FLw7Nxg…hashtag891483785560072192[400, 300, 100][Resist, Resistance, TheResistance, Indivisible, SCROTUS, MAGA]en8921317694697635841.501536e+12
6(597f9d237cc64815add23744,)RT @JoeBeertap: Trump’s Health Failing! Must Wear Body Suit, #ObamaCare Covers Cost\\n\\n#ACA\\n#AHCA\\n#Resist\\n#UniteBlue\\n#TrumpDontCare\\nhttps://t…hashtag891484275501150211[200, 400, 300][ObamaCare, ACA, AHCA, Resist, UniteBlue, TrumpDontCare]en8921307600936509441.501536e+12
7(597f6e697cc64815add140c7,)RT @RxImperator: ARMY+ Anti-´kidnapping´ squad shooting at protesters in Av. Rotaria in #SanCristobal #Tachira.This is a #CivilWar… #30Jul\\nA LA ALTURA D LA AV ROTARIA\\nTACHIRA \\nCONAS Y EL EJERCITO DISPARANDO\\n🚨🚨🚨🚨🚨🚨🚨🚨\\nRESISTENCIA\\n👇👇👇👇👇👇 https://t.co/vv8hPRwTPJhashtag891735372572610562[500, 500, 500][SanCristobal, Tachira, CivilWar, 30Jul]en8920805905094246401.501524e+12
8(597f6ad07cc64815add0fb47,)RT @wwsean08: Got to do a bit of night shooting last night #Astrophotography #Nikon #D3300 https://t.co/Cx90Ag044Nhashtag891762347492036608[300, 500, 200][Astrophotography, Nikon, D3300]en8920767244433285141.501523e+12
9(597f658f7cc64815add092c2,)RT @premgovindani: @SureshChavhanke #BapujiIsInnocent becoz Jodhpur police found no proof against Asaram Bapu Ji. @chander_hariom @CHAUHAN2…hashtag891828569877381120[400, 500, 200][BapujiIsInnocent]en8920710870049832961.501521e+12
10(597f64c37cc64815add08249,)RT @HeliumCine: Thanks to Eliot @ReallyEpicTuts for these beautiful pictures and glowing recommendation of the #HeliumCore #iPhone… The beautiful Nelson Lakes 🇳🇿 feat. my favourite shooting rig: iPhone 6s+ with @HeliumCine #heliumcore and… https://t.co/LOnFUSlxQGhashtag892055487553953793[100, 100, 200][HeliumCore, iPhone, heliumcore]en8920702312623390731.501521e+12
11(597f69247cc64815add0d868,)RT @JenPhillips721: A day in the life of an Agents of S.H.I.E.L.D. fan. We're screwed even during hiatus. 😭 #AgentsofSHIELD https://t.co/Dh… @JenPhillips721 Still, I bet it's gonna be a hell of a story.\\nOh wow, i'm worrying again about #AgentsofSHIELD. WHY… https://t.co/lIbkrVDvMthashtag892066481835462656[300, 300, 100][AgentsofSHIELD, AgentsofSHIELD]en8920749301998141441.501522e+12
12(597f65bf7cc64815add0970d,)RT @wittier: NYPD mugshot of David Bowie, arrested for possession of marijuana. ca. March 1976. https://t.co/EY8aN6vN5h #klout v…hashtag892069206904242176[300, 100, 600][klout]en8920712882562990101.501521e+12
13(597f6b687cc64815add107f8,)RT @sujit4superstar: Right now I'm shooting for @urstrulyMahesh's #BharathAneNenu which is my telugu debut.😍 - @Advani_Kiara https://t.co/w…hashtag892072666068209664[500, 300, 400][BharathAneNenu]en8920773637432770561.501523e+12
14(597f688e7cc64815add0cd54,)RT @kwilli1046: #Antifa says they protest for the average person. All I see is Felons looting businesses &amp; Destroying property. #G20 https:…hashtag892074259350142977[500, 600, 100][Antifa, G20]en8920743024131891251.501522e+12
15(597f76dd7cc64815add1a0c1,)RT @crooksandliars: #CLTV 12 Year Old Trump Super Fan On Fox &amp; Friends - 12-year-old Millie March, who went viral for her support o... http…hashtag892082947905507328[300, 500, 100][CLTV]en8920896699991080961.501526e+12
16(597f78147cc64815add1eefa,)RT @Koreandogs: July 12 First day of #Boknal 2017 ARAs protest at the #MoranMarket #Seongnam #EndDogMeatTrade #SouthKorea .@SBSNOW…hashtag892090652439269376[200, 100, 500][Boknal, MoranMarket, Seongnam, EndDogMeatTrade, SouthKorea]en8920909735468523531.501526e+12
17(597f9d367cc64815add23cb8,)RT @peterdaou: Four days after Trump's inauguration, I made a prediction 👇\\n\\n#Scaramucci #Mooch #Kelly #Reince #Spicer ... https://t.co/sVS0… Your daily reminder that GOP infighting and palace intrigue will be the Trump administration's biggest political challenge.hashtag892093931864850432[500, 600, 400][Scaramucci, Mooch, Kelly, Reince, Spicer]en8921308392613109771.501536e+12
18(597f9e9b7cc64815add2a627,)RT @johnlundin: 'How To Lose A Guy In 10 Days' - starring \"The Mooch\" and \"The #Trump\"\\nThe reviews are in: #unfit! #resist! https://t.co/tx…hashtag892113154439827457[200, 600, 500][Trump, unfit, resist]en8921323396684963841.501536e+12
19(597f9e817cc64815add29e83,)RT @DaShanneStokes: A new face won't change Trump's horrible message.\\n\\n#Scaramucci #resist #theresistance #trump https://t.co/7H7RnfUjBT Anthony Scaramucci's departure as WH communications director came hours into John Kelly's tenure as chief of staff https://t.co/H7iX1aiuCMhashtag892122539790659589[100, 500, 400][Scaramucci, resist, theresistance, trump]en8921322283350999051.501536e+12
20(597f9d787cc64815add2500b,)RT @rswfire: #HTTP301: This thread no longer exists here. You can find it online at https://t.co/bKTgrW2WOb. 🌐hashtag892128046299402240[400, 200, 400][HTTP301]en8921311192981708811.501536e+12
21(597f9d317cc64815add23b82,)RT @ReportUK: GRAPHIC CONTENT Crowd watch as man who raped toddler executed with machine gun in street #D31 https://t.co/ek4v8k88vthashtag892130821699756032[400, 300, 500][D31]en8921308227231293441.501536e+12
22(597f74997cc64815add1557c,)RT @JustinCCRW: 18°C on the day of the march :) The weather's going to be nice and brisk! Perfect for a Saturday march. https://t.co/MXgOq5… March with #TDPM2016 Sept 24! Speakers @DavidLepofsky @SisterMamaD @ingwongward @mel_graham @SilentHill_1998 @depalm https://t.co/V7n8YucihThashtag777865635250466816[600, 600, 500][TDPM2016]en8920872354950266941.501525e+12
23(597f63367cc64815add06355,)RT @physiceDom: Imagine going to jail because of a lizard https://t.co/0DfxlXmOz2 Drug-sniffing lizard joins Avondale Police Department https://t.co/F8T4VsUN8W #abc15 https://t.co/cxVZOkXHpVhashtag854181152420220932[500, 200, 400][abc15]en8920685661865123881.501521e+12
24(597f9cf47cc64815add22a05,)RT @TheWattyWay: I love it when an open house is bringing in my type of crowd. You've got 1.25hrs left! 60 Strange Street #dtk https://t.…hashtag868901072949174272[300, 300, 300][dtk]en8921305663838699521.501535e+12
25(597f9dae7cc64815add25faf,)RT @evelina_darling: Shooting for @WeAreHairy\\nHot 🔥 and beautiful solo 😻\\nYou want to look at it? 😏\\n@evelina_darling #pussy #hairy #summer h…hashtag891291319569702912[200, 300, 400][pussy, hairy, summer]en8921313435173273601.501536e+12
26(597f67667cc64815add0b742,)RT @LVNancy: Wonder if they were \"Asian\"🙄\\nAt least they're not saying \"Motive unknown\"\\n#Australia\\n#Terroist \\n#islam\\n#TravelBan\\nhttps://t.co…hashtag891758305739132928[400, 300, 500][Australia, Terroist, islam, TravelBan]en8920730605719920641.501522e+12
27(597f77577cc64815add1c000,)RT @petefrt: IBD/TIPP Poll: Public Turned Off By Media's Relentlessly Negative Coverage Of Trump \\n\\n#ThatsHowYouGetMoreTrump\\n\\n#p2…hashtag891762855330148353[500, 400, 500][ThatsHowYouGetMoreTrump, p2]en8920901800516280361.501526e+12
28(597f77507cc64815add1be44,)RT @alexgaipo: https://t.co/vk6WVqNjrm\\nThis was a fun day. #extraordinarycanadians #marshallmcluhan https://t.co/3DpTmQ1hvE Fun day of shooting with @DougCoupland \\n\\n#DroneBoy #torontodrone #uav https://t.co/mxbbhWx9TChashtag891801480323268608[200, 400, 600][extraordinarycanadians, marshallmcluhan, DroneBoy, torontodrone, uav]en8920901491018670081.501526e+12
29(597f76687cc64815add183bf,)RT @MarkRuffalo: What is the #MagnitskyAct, why is Putin trying to kill it and why was team Trump meeting Putin's people about it? G… Excellent NPR story on my testimony. Businessman Paints A Terrifying And Complex Picture Of Vladimir Putin's Russia https://t.co/hdXo9AlMiYhashtag891804908747239424[400, 200, 200][MagnitskyAct]en8920891779443220481.501526e+12
..............................
70(597f78987cc64815add20fbc,)RT @davidmputnam: Since becoming President, @realDonaldTrump has spent 35 days golfing and signed 0 healthcare bills into law. #MAGA 😂https…hashtag892031022262571008[600, 400, 400][MAGA]en8920915244055756801.501526e+12
71(597f76377cc64815add17761,)RT @funder: #TheResistance is made up of Dems, GOP-fmr Trump fans-etc. All Americans who are awake &amp; know exactly what should happen next:…hashtag892079764479102977[400, 100, 500][TheResistance]en8920889719451648001.501526e+12
72(597f6ec47cc64815add147b9,)RT @LeeJasper: No doubt the MSM Police Support Troll Group #PTSG will be out in force justifying this brutal attack by @WMPolice… @policemonitor funny how the cops don't like citizens filming them isn't it https://t.co/Lfy14lAHimhashtag892080264188493825[600, 400, 400][PTSG]en8920809720108564481.501524e+12
73(597f78617cc64815add201f8,)RT @citizentvkenya: Family reported him missing to the police on Sunday #MondaySpecialKE w/ @AnneKiguta https://t.co/luupKzlN4Rhashtag892090503579267074[200, 200, 300][MondaySpecialKE]en8920912971373404161.501526e+12
74(597f9cfd7cc64815add22cab,)RT @johnpavlovitz: Calling the Trump Administration a dumpster fire is an insult to dumpster fires. #ImpeachTrumphashtag892093890383228928[200, 300, 600][ImpeachTrump]en8921306042544578591.501535e+12
75(597f9cc27cc64815add21b9f,)RT @shomaristone: Trump insisted earlier today that there was \"no chaos\" in his White House. #Trump\\nhttps://t.co/ugK4ZOzurfhashtag892109489368911873[100, 300, 200][Trump]en8921303556412416061.501535e+12
76(597f9cfb7cc64815add22c0f,)RT @1more4_Trump: Likely the only reason he came on board! Smart on Trump's part!..As usual! https://t.co/dQmuVnNxQT Will always be grateful to the Mooch #Scaramucci for getting rid of the Reince rat!\\n\\nBest of luck to you Mooch! https://t.co/GduiGzLXu1hashtag892129211963408384[500, 200, 500][Scaramucci]en8921305942720143401.501535e+12
77(597f780e7cc64815add1ed7a,)RT @estoguy13: Perspective. There are good reasons for the #transmilitaryban. These reasons aren't based on BS.\\n\\nhttps://t.co/dFARujhstFhashtag890358239845650432[300, 300, 200][transmilitaryban]en8920909482887454721.501526e+12
78(597f9caf7cc64815add215b8,)RT @kindcutesteve: Trump Throws Temper Tantrum &amp; Calls GOP Senators Fools While Raging Over Healthcare Flop #p2 #TNTweeters #USlatino http…hashtag891381941513015296[100, 500, 200][p2, TNTweeters, USlatino]en8921302732062965761.501535e+12
79(597f654c7cc64815add08d15,)RT @TheCalebBond: Do you have a right wing friend? Make sure you call the police. https://t.co/ZGMwHbthTY Do you think you could spot the signs of right wing extremism? Do you know someone who would #CrossTheLine?… https://t.co/hI38RwEek2hashtag891895527247302656[200, 600, 400][CrossTheLine]en8920708041202565131.501521e+12
80(597f63a57cc64815add06c50,)RT @JoJoAngelus: What an #AH that Arsenal Owner. He should launch a TV Channel to save animals not shooting endangered animals… ''No feeling like shooting an endangered elephant': Arsenal owner launches TV channel dedicated to trophy hunting https://t.co/bNIKxcfBdHhashtag892065908906115073[500, 200, 300][AH]en8920690306635694081.501521e+12
81(597f67b47cc64815add0bd0f,)RT @TV__Newz: Black Women Lead #NoConfederate Online Protest Calling Out Controversial HBO Show https://t.co/z2ceuYwNqU https://t.co/QSIPAU…hashtag892072190799212544[300, 200, 300][NoConfederate]en8920733869434265601.501522e+12
82(597f77987cc64815add1d086,)RT @TomWellborn: Perhaps we shouldn't call them \"stray\" bullets.\\n\\n\"Bullet fired by an irresponsible gun wielder\". https://t.co/lXQaE4tM38 #WATCH Police: Teen Hit By Stray Bullet In #Feltonville, No Arrests Made https://t.co/kI5c6x8kKxhashtag892078987073261570[200, 600, 300][WATCH, Feltonville]en8920904525220208641.501526e+12
83(597f9df17cc64815add27410,)RT @Juliet777777: #ITALY Pro-Muslim leftists protest #DefendEurope SHIP SEND MUSLIMS BACK TO AFRICA\\nhttps://t.co/JUMnTSsRsx \\n#auspol #ukra…hashtag892126937677676544[600, 200, 600][ITALY, DefendEurope, auspol]en8921316272827432961.501536e+12
84(597f9e637cc64815add29575,)RT @MTaylorCanfield: Why are some police in low crime rate cities #Seattle/#Minneapolis so aggressive with use deadly force? https://t.co/u…hashtag888765586154463233[100, 400, 300][Seattle, Minneapolis]en8921321028413562881.501536e+12
85(597f76517cc64815add17e39,)RT @ShaunKing: I'm seeing a swiftness of justice for #JustineDamond that I simply have never seen before for a single Black victim… My Latest.\\n\\nBlack victims of white cops simply do not get the universal support &amp; justice afforded #JustineDamond.… https://t.co/UtBLkidGqFhashtag889911899474755584[300, 600, 600][JustineDamond, JustineDamond]en8920890794746552321.501526e+12
86(597f9d187cc64815add2342e,)RT @kindcutesteve: Sally Yates: Trump is trying to 'dismantle the rule of law' #p2 #TNTweeters #USlatino #UniteBlue https://t.co/kRb40jp0vYhashtag891037057102954496[200, 200, 500][p2, TNTweeters, USlatino, UniteBlue]en8921307166365040641.501536e+12
87(597f9cf77cc64815add22aa5,)RT @Valerian_2017: #Scaramucci, one week in: civil war in the White House and an even wilder Trump\\n#NewTrumpAdminScandals \\n#VoteThemOut\\nhtt…hashtag891296821347328000[600, 500, 400][Scaramucci, NewTrumpAdminScandals, VoteThemOut]en8921305751795015701.501535e+12
88(597f6bf57cc64815add1132d,)RT @TheNehaKhan: This is real India. A CRPF jawan stands guard while a J&amp;K Police man offers Namaz in #Kashmir. Brothers in arms! 🇮🇳…hashtag891396025033863168[100, 600, 500][Kashmir]en8920779563060592641.501523e+12
89(597f9ce37cc64815add2251a,)RT @RealEagleBites: BUSTED: Trump smear hoaxers #FusionGPS were PAID by Russia &amp; COLLUDED w liberal media, now DESPERATE CNN COVERS-UP http…hashtag891689168908075009[400, 400, 600][FusionGPS]en8921304955631288321.501535e+12
90(597f776b7cc64815add1c4f0,)RT @KLSouth: IRS Rehired Employees That Falsified Documents, Had Unauthorized Access to Taxpayer Information. #Trump @POTUS https://t.co/HO…hashtag891733931581734913[600, 400, 400][Trump]en8920902633169510441.501526e+12
91(597f63467cc64815add0649c,)RT @DebbieMcqueen11: .@moonriver365 .@nbcnews🔊Protest #StopBoknal2017 #EndDogMeatTrade #SouthKorea🔊July 12 #NY🔊Korean Consulate🔊https://t.c…hashtag891889754178023425[200, 400, 200][StopBoknal2017, EndDogMeatTrade, SouthKorea, NY]en8920686320118661121.501521e+12
92(597f9ebb7cc64815add2af84,)RT @DrJScofield: 🇺🇸🇺🇸HAPPY TRUMP MONDAY🇺🇸🇺🇸 this is Dr. J #TrumpArmy #MAGA 🇺🇸🇺🇸🇺🇸 https://t.co/b0cOJg06L7hashtag892014168152444928[400, 400, 200][TrumpArmy, MAGA]en8921324742219735051.501536e+12
93(597f66ab7cc64815add0a8b7,)RT @theoriginalmel: Kanesatake tomorrow for peaceful protest at noon. Who's coming? #Kanesatake #Oka #Kanehsatà:ke #Kanien'kéha:kahashtag892071656805552128[600, 100, 300][Kanesatake, Oka, Kanehsatà, Kanien]en8920722779398963201.501522e+12
94(597f77637cc64815add1c2e6,)RT @TheDailyEdge: #WhiteHouseChaos After a disastrous first 6 months, Republicans lose confidence in @realDonaldTrump https://t.co/WxdvpFyX…hashtag892075431725330434[200, 200, 600][WhiteHouseChaos]en8920902316583444501.501526e+12
95(597f76df7cc64815add1a139,)RT @LindaSuhler: This pretty much makes @jk_rowling a liar, promoting #FakeNews, doesn't it?\\nDecent people don't do this.\\nObsessed h… JK Rowling hates Trump so much she is happy for 11 million people to wrongly believe he snubbed a disabled boy.… https://t.co/GpVShQn2eFhashtag892081040734289920[500, 100, 600][FakeNews]en8920896781276733441.501526e+12
96(597f76b07cc64815add19579,)RT @Superstarlucita: Protest #ALDUBPowerCouple https://t.co/LD84cGQXyz Word game. Enjoy! #ALDUBPowerCouple @ALDub_RTeam @ALDubFTnQT @ALDUB_inARTeam @TeamKPTY_ORIG26 @AMInaticsOFC… https://t.co/oR01ylbgybhashtag892082505574227970[100, 600, 500][ALDUBPowerCouple, ALDUBPowerCouple]en8920894780132188171.501526e+12
97(597f74e17cc64815add15b01,)RT @SeanH_999: He's a role model to us all. I'm a #TransAlly &amp; I will absolutely make it my business to continue to support &amp; stan… My twin brother,a proud trans man&amp;police officer flying the rainbows flag!So proud! #liverpoolpride2017… https://t.co/9FQzRJOaq1hashtag892084621562826753[100, 600, 500][TransAlly, liverpoolpride2017]en8920875365537873921.501525e+12
98(597f76b47cc64815add1969e,)RT @ZaibatsuNews: Trump embodies every one of the Seven Deadly Sins https://t.co/cbHzBLjlDn #p2 #ccot https://t.co/JY3drYdi81hashtag892086734628040708[300, 400, 100][p2, ccot]en8920894985483182081.501526e+12
99(597f77627cc64815add1c27c,)RT @diane3strikes: Holy #brainwashing batman! #NotNormal #syophantsworship #squirrelsdancingwithknivesintheirheads https://t.co/MQ32i8x1Iu Nearly all Cabinet officials attend Bible lessons with right-wing pastor who compares Trump to biblical heroes https://t.co/ngZ0KwYwHChashtag892089125079318530[200, 100, 600][brainwashing, NotNormal, syophantsworship, squirrelsdancingwithknivesintheirheads]en8920902248006082561.501526e+12
\n", + "

100 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " _id \\\n", + "0 (597f78867cc64815add20b08,) \n", + "1 (597f76f47cc64815add1a67e,) \n", + "2 (597f9e5f7cc64815add29449,) \n", + "3 (597f785b7cc64815add2006d,) \n", + "4 (597f66eb7cc64815add0addd,) \n", + "5 (597f9e137cc64815add27e38,) \n", + "6 (597f9d237cc64815add23744,) \n", + "7 (597f6e697cc64815add140c7,) \n", + "8 (597f6ad07cc64815add0fb47,) \n", + "9 (597f658f7cc64815add092c2,) \n", + "10 (597f64c37cc64815add08249,) \n", + "11 (597f69247cc64815add0d868,) \n", + "12 (597f65bf7cc64815add0970d,) \n", + "13 (597f6b687cc64815add107f8,) \n", + "14 (597f688e7cc64815add0cd54,) \n", + "15 (597f76dd7cc64815add1a0c1,) \n", + "16 (597f78147cc64815add1eefa,) \n", + "17 (597f9d367cc64815add23cb8,) \n", + "18 (597f9e9b7cc64815add2a627,) \n", + "19 (597f9e817cc64815add29e83,) \n", + "20 (597f9d787cc64815add2500b,) \n", + "21 (597f9d317cc64815add23b82,) \n", + "22 (597f74997cc64815add1557c,) \n", + "23 (597f63367cc64815add06355,) \n", + "24 (597f9cf47cc64815add22a05,) \n", + "25 (597f9dae7cc64815add25faf,) \n", + "26 (597f67667cc64815add0b742,) \n", + "27 (597f77577cc64815add1c000,) \n", + "28 (597f77507cc64815add1be44,) \n", + "29 (597f76687cc64815add183bf,) \n", + ".. ... \n", + "70 (597f78987cc64815add20fbc,) \n", + "71 (597f76377cc64815add17761,) \n", + "72 (597f6ec47cc64815add147b9,) \n", + "73 (597f78617cc64815add201f8,) \n", + "74 (597f9cfd7cc64815add22cab,) \n", + "75 (597f9cc27cc64815add21b9f,) \n", + "76 (597f9cfb7cc64815add22c0f,) \n", + "77 (597f780e7cc64815add1ed7a,) \n", + "78 (597f9caf7cc64815add215b8,) \n", + "79 (597f654c7cc64815add08d15,) \n", + "80 (597f63a57cc64815add06c50,) \n", + "81 (597f67b47cc64815add0bd0f,) \n", + "82 (597f77987cc64815add1d086,) \n", + "83 (597f9df17cc64815add27410,) \n", + "84 (597f9e637cc64815add29575,) \n", + "85 (597f76517cc64815add17e39,) \n", + "86 (597f9d187cc64815add2342e,) \n", + "87 (597f9cf77cc64815add22aa5,) \n", + "88 (597f6bf57cc64815add1132d,) \n", + "89 (597f9ce37cc64815add2251a,) \n", + "90 (597f776b7cc64815add1c4f0,) \n", + "91 (597f63467cc64815add0649c,) \n", + "92 (597f9ebb7cc64815add2af84,) \n", + "93 (597f66ab7cc64815add0a8b7,) \n", + "94 (597f77637cc64815add1c2e6,) \n", + "95 (597f76df7cc64815add1a139,) \n", + "96 (597f76b07cc64815add19579,) \n", + "97 (597f74e17cc64815add15b01,) \n", + "98 (597f76b47cc64815add1969e,) \n", + "99 (597f77627cc64815add1c27c,) \n", + "\n", + " text \\\n", + "0 RT @NickMosky: This makes more sense to me than planking https://t.co/MG3Q8xMp7E The new fad is to take selfies in front of Trump 🖕properties🖕.\\n\\n#RESIST #FillibusterGorsuch #TrumpRussia #billions… https://t.co/tdVsesxK5k \n", + "1 RT @Homerx59: @TweetiepieHolly Me too... he could very well be worse. His #religiouszealot attitude is real where trump only beli… \n", + "2 RT @RickJoh05200551: Is Trump just another mob boss? ...a hardened criminal? https://t.co/M6zeTHUAa5 #SkinnyRepeal #Resist #TrumpRussia #S… \n", + "3 RT @YishaiFleisher: Join me @YishaiFleisher for the annual 9th of Av march around the walls of the Old City of #Jerusalem https://t.co/JRxv… \n", + "4 RT @PRETTYMUCH: celebrating one week of #WouldYouMind by shooting a q&a video! what do you guys want to know? use #ASKPRETTYMUCH https://t.… \n", + "5 RT @mcspocky: Dear tRump cultists…\\n#Resist #Resistance #TheResistance #Indivisible\\n#SCROTUS \\n#MAGA (by Jailing tRump) https://t.co/8FLw7Nxg… \n", + "6 RT @JoeBeertap: Trump’s Health Failing! Must Wear Body Suit, #ObamaCare Covers Cost\\n\\n#ACA\\n#AHCA\\n#Resist\\n#UniteBlue\\n#TrumpDontCare\\nhttps://t… \n", + "7 RT @RxImperator: ARMY+ Anti-´kidnapping´ squad shooting at protesters in Av. Rotaria in #SanCristobal #Tachira.This is a #CivilWar… #30Jul\\nA LA ALTURA D LA AV ROTARIA\\nTACHIRA \\nCONAS Y EL EJERCITO DISPARANDO\\n🚨🚨🚨🚨🚨🚨🚨🚨\\nRESISTENCIA\\n👇👇👇👇👇👇 https://t.co/vv8hPRwTPJ \n", + "8 RT @wwsean08: Got to do a bit of night shooting last night #Astrophotography #Nikon #D3300 https://t.co/Cx90Ag044N \n", + "9 RT @premgovindani: @SureshChavhanke #BapujiIsInnocent becoz Jodhpur police found no proof against Asaram Bapu Ji. @chander_hariom @CHAUHAN2… \n", + "10 RT @HeliumCine: Thanks to Eliot @ReallyEpicTuts for these beautiful pictures and glowing recommendation of the #HeliumCore #iPhone… The beautiful Nelson Lakes 🇳🇿 feat. my favourite shooting rig: iPhone 6s+ with @HeliumCine #heliumcore and… https://t.co/LOnFUSlxQG \n", + "11 RT @JenPhillips721: A day in the life of an Agents of S.H.I.E.L.D. fan. We're screwed even during hiatus. 😭 #AgentsofSHIELD https://t.co/Dh… @JenPhillips721 Still, I bet it's gonna be a hell of a story.\\nOh wow, i'm worrying again about #AgentsofSHIELD. WHY… https://t.co/lIbkrVDvMt \n", + "12 RT @wittier: NYPD mugshot of David Bowie, arrested for possession of marijuana. ca. March 1976. https://t.co/EY8aN6vN5h #klout v… \n", + "13 RT @sujit4superstar: Right now I'm shooting for @urstrulyMahesh's #BharathAneNenu which is my telugu debut.😍 - @Advani_Kiara https://t.co/w… \n", + "14 RT @kwilli1046: #Antifa says they protest for the average person. All I see is Felons looting businesses & Destroying property. #G20 https:… \n", + "15 RT @crooksandliars: #CLTV 12 Year Old Trump Super Fan On Fox & Friends - 12-year-old Millie March, who went viral for her support o... http… \n", + "16 RT @Koreandogs: July 12 First day of #Boknal 2017 ARAs protest at the #MoranMarket #Seongnam #EndDogMeatTrade #SouthKorea .@SBSNOW… \n", + "17 RT @peterdaou: Four days after Trump's inauguration, I made a prediction 👇\\n\\n#Scaramucci #Mooch #Kelly #Reince #Spicer ... https://t.co/sVS0… Your daily reminder that GOP infighting and palace intrigue will be the Trump administration's biggest political challenge. \n", + "18 RT @johnlundin: 'How To Lose A Guy In 10 Days' - starring \"The Mooch\" and \"The #Trump\"\\nThe reviews are in: #unfit! #resist! https://t.co/tx… \n", + "19 RT @DaShanneStokes: A new face won't change Trump's horrible message.\\n\\n#Scaramucci #resist #theresistance #trump https://t.co/7H7RnfUjBT Anthony Scaramucci's departure as WH communications director came hours into John Kelly's tenure as chief of staff https://t.co/H7iX1aiuCM \n", + "20 RT @rswfire: #HTTP301: This thread no longer exists here. You can find it online at https://t.co/bKTgrW2WOb. 🌐 \n", + "21 RT @ReportUK: GRAPHIC CONTENT Crowd watch as man who raped toddler executed with machine gun in street #D31 https://t.co/ek4v8k88vt \n", + "22 RT @JustinCCRW: 18°C on the day of the march :) The weather's going to be nice and brisk! Perfect for a Saturday march. https://t.co/MXgOq5… March with #TDPM2016 Sept 24! Speakers @DavidLepofsky @SisterMamaD @ingwongward @mel_graham @SilentHill_1998 @depalm https://t.co/V7n8YucihT \n", + "23 RT @physiceDom: Imagine going to jail because of a lizard https://t.co/0DfxlXmOz2 Drug-sniffing lizard joins Avondale Police Department https://t.co/F8T4VsUN8W #abc15 https://t.co/cxVZOkXHpV \n", + "24 RT @TheWattyWay: I love it when an open house is bringing in my type of crowd. You've got 1.25hrs left! 60 Strange Street #dtk https://t.… \n", + "25 RT @evelina_darling: Shooting for @WeAreHairy\\nHot 🔥 and beautiful solo 😻\\nYou want to look at it? 😏\\n@evelina_darling #pussy #hairy #summer h… \n", + "26 RT @LVNancy: Wonder if they were \"Asian\"🙄\\nAt least they're not saying \"Motive unknown\"\\n#Australia\\n#Terroist \\n#islam\\n#TravelBan\\nhttps://t.co… \n", + "27 RT @petefrt: IBD/TIPP Poll: Public Turned Off By Media's Relentlessly Negative Coverage Of Trump \\n\\n#ThatsHowYouGetMoreTrump\\n\\n#p2… \n", + "28 RT @alexgaipo: https://t.co/vk6WVqNjrm\\nThis was a fun day. #extraordinarycanadians #marshallmcluhan https://t.co/3DpTmQ1hvE Fun day of shooting with @DougCoupland \\n\\n#DroneBoy #torontodrone #uav https://t.co/mxbbhWx9TC \n", + "29 RT @MarkRuffalo: What is the #MagnitskyAct, why is Putin trying to kill it and why was team Trump meeting Putin's people about it? G… Excellent NPR story on my testimony. Businessman Paints A Terrifying And Complex Picture Of Vladimir Putin's Russia https://t.co/hdXo9AlMiY \n", + ".. ... \n", + "70 RT @davidmputnam: Since becoming President, @realDonaldTrump has spent 35 days golfing and signed 0 healthcare bills into law. #MAGA 😂https… \n", + "71 RT @funder: #TheResistance is made up of Dems, GOP-fmr Trump fans-etc. All Americans who are awake & know exactly what should happen next:… \n", + "72 RT @LeeJasper: No doubt the MSM Police Support Troll Group #PTSG will be out in force justifying this brutal attack by @WMPolice… @policemonitor funny how the cops don't like citizens filming them isn't it https://t.co/Lfy14lAHim \n", + "73 RT @citizentvkenya: Family reported him missing to the police on Sunday #MondaySpecialKE w/ @AnneKiguta https://t.co/luupKzlN4R \n", + "74 RT @johnpavlovitz: Calling the Trump Administration a dumpster fire is an insult to dumpster fires. #ImpeachTrump \n", + "75 RT @shomaristone: Trump insisted earlier today that there was \"no chaos\" in his White House. #Trump\\nhttps://t.co/ugK4ZOzurf \n", + "76 RT @1more4_Trump: Likely the only reason he came on board! Smart on Trump's part!..As usual! https://t.co/dQmuVnNxQT Will always be grateful to the Mooch #Scaramucci for getting rid of the Reince rat!\\n\\nBest of luck to you Mooch! https://t.co/GduiGzLXu1 \n", + "77 RT @estoguy13: Perspective. There are good reasons for the #transmilitaryban. These reasons aren't based on BS.\\n\\nhttps://t.co/dFARujhstF \n", + "78 RT @kindcutesteve: Trump Throws Temper Tantrum & Calls GOP Senators Fools While Raging Over Healthcare Flop #p2 #TNTweeters #USlatino http… \n", + "79 RT @TheCalebBond: Do you have a right wing friend? Make sure you call the police. https://t.co/ZGMwHbthTY Do you think you could spot the signs of right wing extremism? Do you know someone who would #CrossTheLine?… https://t.co/hI38RwEek2 \n", + "80 RT @JoJoAngelus: What an #AH that Arsenal Owner. He should launch a TV Channel to save animals not shooting endangered animals… ''No feeling like shooting an endangered elephant': Arsenal owner launches TV channel dedicated to trophy hunting https://t.co/bNIKxcfBdH \n", + "81 RT @TV__Newz: Black Women Lead #NoConfederate Online Protest Calling Out Controversial HBO Show https://t.co/z2ceuYwNqU https://t.co/QSIPAU… \n", + "82 RT @TomWellborn: Perhaps we shouldn't call them \"stray\" bullets.\\n\\n\"Bullet fired by an irresponsible gun wielder\". https://t.co/lXQaE4tM38 #WATCH Police: Teen Hit By Stray Bullet In #Feltonville, No Arrests Made https://t.co/kI5c6x8kKx \n", + "83 RT @Juliet777777: #ITALY Pro-Muslim leftists protest #DefendEurope SHIP SEND MUSLIMS BACK TO AFRICA\\nhttps://t.co/JUMnTSsRsx \\n#auspol #ukra… \n", + "84 RT @MTaylorCanfield: Why are some police in low crime rate cities #Seattle/#Minneapolis so aggressive with use deadly force? https://t.co/u… \n", + "85 RT @ShaunKing: I'm seeing a swiftness of justice for #JustineDamond that I simply have never seen before for a single Black victim… My Latest.\\n\\nBlack victims of white cops simply do not get the universal support & justice afforded #JustineDamond.… https://t.co/UtBLkidGqF \n", + "86 RT @kindcutesteve: Sally Yates: Trump is trying to 'dismantle the rule of law' #p2 #TNTweeters #USlatino #UniteBlue https://t.co/kRb40jp0vY \n", + "87 RT @Valerian_2017: #Scaramucci, one week in: civil war in the White House and an even wilder Trump\\n#NewTrumpAdminScandals \\n#VoteThemOut\\nhtt… \n", + "88 RT @TheNehaKhan: This is real India. A CRPF jawan stands guard while a J&K Police man offers Namaz in #Kashmir. Brothers in arms! 🇮🇳… \n", + "89 RT @RealEagleBites: BUSTED: Trump smear hoaxers #FusionGPS were PAID by Russia & COLLUDED w liberal media, now DESPERATE CNN COVERS-UP http… \n", + "90 RT @KLSouth: IRS Rehired Employees That Falsified Documents, Had Unauthorized Access to Taxpayer Information. #Trump @POTUS https://t.co/HO… \n", + "91 RT @DebbieMcqueen11: .@moonriver365 .@nbcnews🔊Protest #StopBoknal2017 #EndDogMeatTrade #SouthKorea🔊July 12 #NY🔊Korean Consulate🔊https://t.c… \n", + "92 RT @DrJScofield: 🇺🇸🇺🇸HAPPY TRUMP MONDAY🇺🇸🇺🇸 this is Dr. J #TrumpArmy #MAGA 🇺🇸🇺🇸🇺🇸 https://t.co/b0cOJg06L7 \n", + "93 RT @theoriginalmel: Kanesatake tomorrow for peaceful protest at noon. Who's coming? #Kanesatake #Oka #Kanehsatà:ke #Kanien'kéha:ka \n", + "94 RT @TheDailyEdge: #WhiteHouseChaos After a disastrous first 6 months, Republicans lose confidence in @realDonaldTrump https://t.co/WxdvpFyX… \n", + "95 RT @LindaSuhler: This pretty much makes @jk_rowling a liar, promoting #FakeNews, doesn't it?\\nDecent people don't do this.\\nObsessed h… JK Rowling hates Trump so much she is happy for 11 million people to wrongly believe he snubbed a disabled boy.… https://t.co/GpVShQn2eF \n", + "96 RT @Superstarlucita: Protest #ALDUBPowerCouple https://t.co/LD84cGQXyz Word game. Enjoy! #ALDUBPowerCouple @ALDub_RTeam @ALDubFTnQT @ALDUB_inARTeam @TeamKPTY_ORIG26 @AMInaticsOFC… https://t.co/oR01ylbgyb \n", + "97 RT @SeanH_999: He's a role model to us all. I'm a #TransAlly & I will absolutely make it my business to continue to support & stan… My twin brother,a proud trans man&police officer flying the rainbows flag!So proud! #liverpoolpride2017… https://t.co/9FQzRJOaq1 \n", + "98 RT @ZaibatsuNews: Trump embodies every one of the Seven Deadly Sins https://t.co/cbHzBLjlDn #p2 #ccot https://t.co/JY3drYdi81 \n", + "99 RT @diane3strikes: Holy #brainwashing batman! #NotNormal #syophantsworship #squirrelsdancingwithknivesintheirheads https://t.co/MQ32i8x1Iu Nearly all Cabinet officials attend Bible lessons with right-wing pastor who compares Trump to biblical heroes https://t.co/ngZ0KwYwHC \n", + "\n", + " featurizer broadcast_post_id campaigns \\\n", + "0 hashtag 849041445126361090 [600, 600, 600] \n", + "1 hashtag 873060626406322176 [300, 100, 200] \n", + "2 hashtag 890387033146961920 [600, 400, 100] \n", + "3 hashtag 890585584993001472 [100, 100, 200] \n", + "4 hashtag 891043343479652353 [300, 400, 300] \n", + "5 hashtag 891483785560072192 [400, 300, 100] \n", + "6 hashtag 891484275501150211 [200, 400, 300] \n", + "7 hashtag 891735372572610562 [500, 500, 500] \n", + "8 hashtag 891762347492036608 [300, 500, 200] \n", + "9 hashtag 891828569877381120 [400, 500, 200] \n", + "10 hashtag 892055487553953793 [100, 100, 200] \n", + "11 hashtag 892066481835462656 [300, 300, 100] \n", + "12 hashtag 892069206904242176 [300, 100, 600] \n", + "13 hashtag 892072666068209664 [500, 300, 400] \n", + "14 hashtag 892074259350142977 [500, 600, 100] \n", + "15 hashtag 892082947905507328 [300, 500, 100] \n", + "16 hashtag 892090652439269376 [200, 100, 500] \n", + "17 hashtag 892093931864850432 [500, 600, 400] \n", + "18 hashtag 892113154439827457 [200, 600, 500] \n", + "19 hashtag 892122539790659589 [100, 500, 400] \n", + "20 hashtag 892128046299402240 [400, 200, 400] \n", + "21 hashtag 892130821699756032 [400, 300, 500] \n", + "22 hashtag 777865635250466816 [600, 600, 500] \n", + "23 hashtag 854181152420220932 [500, 200, 400] \n", + "24 hashtag 868901072949174272 [300, 300, 300] \n", + "25 hashtag 891291319569702912 [200, 300, 400] \n", + "26 hashtag 891758305739132928 [400, 300, 500] \n", + "27 hashtag 891762855330148353 [500, 400, 500] \n", + "28 hashtag 891801480323268608 [200, 400, 600] \n", + "29 hashtag 891804908747239424 [400, 200, 200] \n", + ".. ... ... ... \n", + "70 hashtag 892031022262571008 [600, 400, 400] \n", + "71 hashtag 892079764479102977 [400, 100, 500] \n", + "72 hashtag 892080264188493825 [600, 400, 400] \n", + "73 hashtag 892090503579267074 [200, 200, 300] \n", + "74 hashtag 892093890383228928 [200, 300, 600] \n", + "75 hashtag 892109489368911873 [100, 300, 200] \n", + "76 hashtag 892129211963408384 [500, 200, 500] \n", + "77 hashtag 890358239845650432 [300, 300, 200] \n", + "78 hashtag 891381941513015296 [100, 500, 200] \n", + "79 hashtag 891895527247302656 [200, 600, 400] \n", + "80 hashtag 892065908906115073 [500, 200, 300] \n", + "81 hashtag 892072190799212544 [300, 200, 300] \n", + "82 hashtag 892078987073261570 [200, 600, 300] \n", + "83 hashtag 892126937677676544 [600, 200, 600] \n", + "84 hashtag 888765586154463233 [100, 400, 300] \n", + "85 hashtag 889911899474755584 [300, 600, 600] \n", + "86 hashtag 891037057102954496 [200, 200, 500] \n", + "87 hashtag 891296821347328000 [600, 500, 400] \n", + "88 hashtag 891396025033863168 [100, 600, 500] \n", + "89 hashtag 891689168908075009 [400, 400, 600] \n", + "90 hashtag 891733931581734913 [600, 400, 400] \n", + "91 hashtag 891889754178023425 [200, 400, 200] \n", + "92 hashtag 892014168152444928 [400, 400, 200] \n", + "93 hashtag 892071656805552128 [600, 100, 300] \n", + "94 hashtag 892075431725330434 [200, 200, 600] \n", + "95 hashtag 892081040734289920 [500, 100, 600] \n", + "96 hashtag 892082505574227970 [100, 600, 500] \n", + "97 hashtag 892084621562826753 [100, 600, 500] \n", + "98 hashtag 892086734628040708 [300, 400, 100] \n", + "99 hashtag 892089125079318530 [200, 100, 600] \n", + "\n", + " hashtags \\\n", + "0 [RESIST, FillibusterGorsuch, TrumpRussia, billions] \n", + "1 [religiouszealot] \n", + "2 [SkinnyRepeal, Resist, TrumpRussia] \n", + "3 [Jerusalem] \n", + "4 [WouldYouMind, ASKPRETTYMUCH] \n", + "5 [Resist, Resistance, TheResistance, Indivisible, SCROTUS, MAGA] \n", + "6 [ObamaCare, ACA, AHCA, Resist, UniteBlue, TrumpDontCare] \n", + "7 [SanCristobal, Tachira, CivilWar, 30Jul] \n", + "8 [Astrophotography, Nikon, D3300] \n", + "9 [BapujiIsInnocent] \n", + "10 [HeliumCore, iPhone, heliumcore] \n", + "11 [AgentsofSHIELD, AgentsofSHIELD] \n", + "12 [klout] \n", + "13 [BharathAneNenu] \n", + "14 [Antifa, G20] \n", + "15 [CLTV] \n", + "16 [Boknal, MoranMarket, Seongnam, EndDogMeatTrade, SouthKorea] \n", + "17 [Scaramucci, Mooch, Kelly, Reince, Spicer] \n", + "18 [Trump, unfit, resist] \n", + "19 [Scaramucci, resist, theresistance, trump] \n", + "20 [HTTP301] \n", + "21 [D31] \n", + "22 [TDPM2016] \n", + "23 [abc15] \n", + "24 [dtk] \n", + "25 [pussy, hairy, summer] \n", + "26 [Australia, Terroist, islam, TravelBan] \n", + "27 [ThatsHowYouGetMoreTrump, p2] \n", + "28 [extraordinarycanadians, marshallmcluhan, DroneBoy, torontodrone, uav] \n", + "29 [MagnitskyAct] \n", + ".. ... \n", + "70 [MAGA] \n", + "71 [TheResistance] \n", + "72 [PTSG] \n", + "73 [MondaySpecialKE] \n", + "74 [ImpeachTrump] \n", + "75 [Trump] \n", + "76 [Scaramucci] \n", + "77 [transmilitaryban] \n", + "78 [p2, TNTweeters, USlatino] \n", + "79 [CrossTheLine] \n", + "80 [AH] \n", + "81 [NoConfederate] \n", + "82 [WATCH, Feltonville] \n", + "83 [ITALY, DefendEurope, auspol] \n", + "84 [Seattle, Minneapolis] \n", + "85 [JustineDamond, JustineDamond] \n", + "86 [p2, TNTweeters, USlatino, UniteBlue] \n", + "87 [Scaramucci, NewTrumpAdminScandals, VoteThemOut] \n", + "88 [Kashmir] \n", + "89 [FusionGPS] \n", + "90 [Trump] \n", + "91 [StopBoknal2017, EndDogMeatTrade, SouthKorea, NY] \n", + "92 [TrumpArmy, MAGA] \n", + "93 [Kanesatake, Oka, Kanehsatà, Kanien] \n", + "94 [WhiteHouseChaos] \n", + "95 [FakeNews] \n", + "96 [ALDUBPowerCouple, ALDUBPowerCouple] \n", + "97 [TransAlly, liverpoolpride2017] \n", + "98 [p2, ccot] \n", + "99 [brainwashing, NotNormal, syophantsworship, squirrelsdancingwithknivesintheirheads] \n", + "\n", + " lang post_id timestamp_ms \n", + "0 en 892091452196519936 1.501526e+12 \n", + "1 en 892089765998116864 1.501526e+12 \n", + "2 en 892132086399856643 1.501536e+12 \n", + "3 en 892091269366910976 1.501526e+12 \n", + "4 en 892072546614595584 1.501522e+12 \n", + "5 en 892131769469763584 1.501536e+12 \n", + "6 en 892130760093650944 1.501536e+12 \n", + "7 en 892080590509424640 1.501524e+12 \n", + "8 en 892076724443328514 1.501523e+12 \n", + "9 en 892071087004983296 1.501521e+12 \n", + "10 en 892070231262339073 1.501521e+12 \n", + "11 en 892074930199814144 1.501522e+12 \n", + "12 en 892071288256299010 1.501521e+12 \n", + "13 en 892077363743277056 1.501523e+12 \n", + "14 en 892074302413189125 1.501522e+12 \n", + "15 en 892089669999108096 1.501526e+12 \n", + "16 en 892090973546852353 1.501526e+12 \n", + "17 en 892130839261310977 1.501536e+12 \n", + "18 en 892132339668496384 1.501536e+12 \n", + "19 en 892132228335099905 1.501536e+12 \n", + "20 en 892131119298170881 1.501536e+12 \n", + "21 en 892130822723129344 1.501536e+12 \n", + "22 en 892087235495026694 1.501525e+12 \n", + "23 en 892068566186512388 1.501521e+12 \n", + "24 en 892130566383869952 1.501535e+12 \n", + "25 en 892131343517327360 1.501536e+12 \n", + "26 en 892073060571992064 1.501522e+12 \n", + "27 en 892090180051628036 1.501526e+12 \n", + "28 en 892090149101867008 1.501526e+12 \n", + "29 en 892089177944322048 1.501526e+12 \n", + ".. ... ... ... \n", + "70 en 892091524405575680 1.501526e+12 \n", + "71 en 892088971945164800 1.501526e+12 \n", + "72 en 892080972010856448 1.501524e+12 \n", + "73 en 892091297137340416 1.501526e+12 \n", + "74 en 892130604254457859 1.501535e+12 \n", + "75 en 892130355641241606 1.501535e+12 \n", + "76 en 892130594272014340 1.501535e+12 \n", + "77 en 892090948288745472 1.501526e+12 \n", + "78 en 892130273206296576 1.501535e+12 \n", + "79 en 892070804120256513 1.501521e+12 \n", + "80 en 892069030663569408 1.501521e+12 \n", + "81 en 892073386943426560 1.501522e+12 \n", + "82 en 892090452522020864 1.501526e+12 \n", + "83 en 892131627282743296 1.501536e+12 \n", + "84 en 892132102841356288 1.501536e+12 \n", + "85 en 892089079474655232 1.501526e+12 \n", + "86 en 892130716636504064 1.501536e+12 \n", + "87 en 892130575179501570 1.501535e+12 \n", + "88 en 892077956306059264 1.501523e+12 \n", + "89 en 892130495563128832 1.501535e+12 \n", + "90 en 892090263316951044 1.501526e+12 \n", + "91 en 892068632011866112 1.501521e+12 \n", + "92 en 892132474221973505 1.501536e+12 \n", + "93 en 892072277939896320 1.501522e+12 \n", + "94 en 892090231658344450 1.501526e+12 \n", + "95 en 892089678127673344 1.501526e+12 \n", + "96 en 892089478013218817 1.501526e+12 \n", + "97 en 892087536553787392 1.501525e+12 \n", + "98 en 892089498548318208 1.501526e+12 \n", + "99 en 892090224800608256 1.501526e+12 \n", + "\n", + "[100 rows x 9 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = query_labeled_posts(0, 0)\n", + "pp(df, limit=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StructType(List(StructField(post_id,StringType,true),StructField(_tag,StringType,true)))\n", + "count: 11179\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_id_tag
0892091452196519936resist
1892091452196519936fillibustergorsuch
2892091452196519936trumprussia
3892091452196519936billions
4892089765998116864religiouszealot
5892132086399856643skinnyrepeal
6892132086399856643resist
7892132086399856643trumprussia
8892091269366910976jerusalem
9892072546614595584wouldyoumind
\n", + "
" + ], + "text/plain": [ + " post_id _tag\n", + "0 892091452196519936 resist\n", + "1 892091452196519936 fillibustergorsuch\n", + "2 892091452196519936 trumprussia\n", + "3 892091452196519936 billions\n", + "4 892089765998116864 religiouszealot\n", + "5 892132086399856643 skinnyrepeal\n", + "6 892132086399856643 resist\n", + "7 892132086399856643 trumprussia\n", + "8 892091269366910976 jerusalem\n", + "9 892072546614595584 wouldyoumind" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# duplicate post_ids for each hashtag, expanded to rows. \n", + "# lowercased hashtags.\n", + "df2 = df.select('post_id', F.explode('hashtags').alias('tag'))\n", + "df2 = df2.withColumn('_tag', F.lower(F.col('tag')))\n", + "df2 = df2.drop('tag')\n", + "pp(df2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "StructType(List(StructField(_tag,StringType,true),StructField(post_ids,ArrayType(StringType,true),true),StructField(cnt_post_ids,LongType,false)))\n", + "count: 4585\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tagpost_idscnt_post_ids
0trump[892130311353491456, 892089069508886528, 892091196536958976, 892089022285324288, 892091257257938945, 892131104123113473, 892131967268933632, 892091382743134208, 892089570493431810, 892132402503483392, 892090418946605057, 892130277237161986, 892090944748752896, 892091096368529410, 892090195515916288, 892131105599614976, 892080208022347776, 892132395532656641, 892130395193356289, 892131881499725824, 892130759691075584, 892089535739424768, 892089998459252737, 892091175976538112, 892131874780393478, 892131652230553600, 892130634101129217, 892132140657332225, 892131326723334146, 892089159350878209, 892132206914752513, 892131672229122051, 892130753018028036, 892090696831836161, 892131915842629632, 892131112885026816, 892130365015339008, 892089751209218048, 892090845263953921, 892130304953122...739
1scaramucci[892132262958977024, 892130594272014340, 892130254055239680, 892131699118587904, 892130764002865156, 892131779091591168, 892131881499725824, 892132439467982848, 892131963129270273, 892131668160647168, 892132070742306820, 892131745591635970, 892130597551955968, 892131874780393478, 892131652230553600, 892131715367542786, 892132367644622848, 892132167022596096, 892131672229122051, 892131758199762944, 892130753018028036, 892131915842629632, 892132131476045824, 892130304953122816, 892131699647184896, 892130322510434305, 892130547413037056, 892131402136813573, 892130353967767552, 892090646923796480, 892130730796625920, 892131465131110400, 892130792616452098, 892130898749083648, 892131397791604736, 892130916197388291, 892130312867651585, 892131167604006913, 892132151612801024, 892130721585930...335
2maga[892089115877015552, 892086778278105089, 892078956500975619, 892091135782465537, 892081029984530432, 892091312677244928, 892075919497625601, 892089445238923264, 892090702770847744, 892089535739424768, 892130248632004609, 892090049004744704, 892089417237749761, 892090904496017408, 892079467153297408, 892131590750580738, 892089763620159489, 892131112885026816, 892090854483140608, 892090845263953921, 892131043448422400, 892089943136370688, 892091364539760640, 892079812172554240, 892089538692210688, 892131118199312387, 892089447369523204, 892090701487501313, 892088983395733504, 892090730876895233, 892089166728753152, 892079523264471041, 892089773413847041, 892089148626087937, 892130288360275968, 892089104451616770, 892078291208871936, 892077078744576000, 892091528100929536, 892131057943883...284
3police[892076286184640512, 892090571954761728, 892077117839736832, 892070379296096256, 892073181967785984, 892077226241425408, 892080769237233668, 892071978978484224, 892068426864373762, 892080386607546369, 892070868423237633, 892130575074676737, 892073229480849412, 892079998869360646, 892130362809307137, 892079097689624576, 892087419524304897, 892070868309770240, 892070640555094016, 892130738954530818, 892068911935504385, 892077836248416256, 892086803108331524, 892077359615983616, 892068303648116736, 892071904428687360, 892069284414971904, 892074606005178368, 892078796454723584, 892074228488355841, 892072684061945861, 892131299086942209, 892080434972160001, 892090332782907392, 892072110935486464, 892071928600580096, 892077014265495553, 892068646046117888, 892073173742735364, 892132118888951...171
4noconfederate[892073427040952321, 892089404931555328, 892079622162186240, 892080277173919744, 892073268408090625, 892078262825910272, 892071862334652416, 892090273936887815, 892072872159707137, 892078736312500224, 892076008601337856, 892077537055997952, 892078196128071684, 892074034661380096, 892071832010063877, 892077880779259905, 892070024311181313, 892077153411514370, 892074414564675584, 892086556617519104, 892071294413541376, 892070671794163712, 892080681710497792, 892078599469137921, 892076690322538496, 892068390986285056, 892079270729846785, 892131451545817088, 892072749451096068, 892073511258386434, 892068302176104457, 892075876816498689, 892087929572610048, 892069391826898944, 892073434938826757, 892077150735659008, 892074095466119168, 892079681159278592, 892079599173091329, 892077250937573...87
5tcot[892090848577560577, 892091257257938945, 892091503903883264, 892067993739505666, 892131454536339456, 892090848938209280, 892091348584673280, 892072125137375235, 892090354631151618, 892076543119421441, 892090149885968384, 892089173477400576, 892090228864843777, 892132213701193729, 892131846674415617, 892131450094485504, 892089535739424768, 892131312789860353, 892131116026667008, 892090226939760640, 892090236955762688, 892090843242393602, 892090866751418368, 892086542491156482, 892080458510589952, 892131788310560768, 892132443372875778, 892131112885026816, 892090854483140608, 892090617358041090, 892090845263953921, 892089924962406400, 892090446238932993, 892131955738763266, 892089397482594304, 892076531815772160, 892089538692210688, 892131118199312387, 892070986178232321, 892090695091183...85
6resist[892089338799902720, 892089460631982080, 892132445495074816, 892090427532341248, 892089526788730880, 892131591274868737, 892132416747425793, 892131640922710016, 892088505719021570, 892074430108663808, 892132228335099905, 892074128076832769, 892131401478402048, 892070817994952704, 892078757921652736, 892077359615983616, 892088993453670400, 892131567044358144, 892089566198419456, 892130609329557505, 892078940612722689, 892132131476045824, 892132086399856643, 892130760093650944, 892090473548058624, 892090250020831232, 892090646923796480, 892131876902768641, 892131769469763584, 892089900790497280, 892131283391967237, 892091165914411008, 892131747323797504, 892090384440066050, 892130322892156929, 892132339668496384, 892130291791179776, 892073605575651328, 892131879868026880, 892091073505382...72
7trumprussia[892089325156007937, 892089553489678336, 892088911857676289, 892132103764287488, 892088919948488704, 892089353333112832, 892091276073598976, 892130262347206656, 892090663151554560, 892089485479071744, 892089858231087106, 892131149354586114, 892089516550483968, 892090299362705408, 892130341762347008, 892131423506886656, 892090140339863554, 892089668509908992, 892088891783688192, 892090575096352768, 892089035145060353, 892089566198419456, 892131705070465024, 892131963015909376, 892132116116504581, 892132131476045824, 892132086399856643, 892089446744633344, 892088901522800648, 892091434748321798, 892089617540722689, 892090971856355328, 892090745452204032, 892089136210997252, 892089476603822080, 892088944120270849, 892090384440066050, 892090821914263553, 892089840543694850, 892090712472272...65
8theresistance[892131274264989696, 892130442002006017, 892131062016561152, 892090730876895233, 892089460631982080, 892073605575651328, 892130944370528256, 892068692371976192, 892132103764287488, 892088966152900608, 892080208022347776, 892131665341849600, 892089494169694208, 892132098739503110, 892090159277232133, 892089240577753088, 892090553055285249, 892088978098315269, 892088971945164800, 892130578979373057, 892130627205554177, 892132228335099905, 892091461684035584, 892086860008349697, 892131272054755332, 892132428432773120, 892089969283661825, 892088875404869635, 892088960519987200, 892131798146203648, 892074524048474112, 892089411059433476, 892090007682527232, 892088993453670400, 892089774818721792, 892132116116504581, 892080976481988608, 892130877769003008, 892130331490254848, 892088952492097...54
9p2[892089338799902720, 892090418518740996, 892131879868026880, 892130572432154625, 892090427532341248, 892089036554338304, 892091226425401344, 892072125137375235, 892130716636504064, 892091513995378689, 892091425755734016, 892130252591316992, 892132213701193729, 892130273206296576, 892090069175209988, 892131846674415617, 892090153442975745, 892131702566531078, 892090470410715138, 892090049004744704, 892130725046005761, 892090236955762688, 892090180051628036, 892089498548318208, 892130550684696577, 892090739894738945, 892090176801034240, 892130522129985536, 892089984823570433, 892090672194482176, 892071446410809344, 892070628349685760, 892130880080052224, 892090446238932993, 892132283087548417, 892088928437755904, 892089900790497280, 892088959341395968, 892132005235830784, 892090652279701...49
\n", + "
" + ], + "text/plain": [ + " _tag \\\n", + "0 trump \n", + "1 scaramucci \n", + "2 maga \n", + "3 police \n", + "4 noconfederate \n", + "5 tcot \n", + "6 resist \n", + "7 trumprussia \n", + "8 theresistance \n", + "9 p2 \n", + "\n", + " post_ids \\\n", + "0 [892130311353491456, 892089069508886528, 892091196536958976, 892089022285324288, 892091257257938945, 892131104123113473, 892131967268933632, 892091382743134208, 892089570493431810, 892132402503483392, 892090418946605057, 892130277237161986, 892090944748752896, 892091096368529410, 892090195515916288, 892131105599614976, 892080208022347776, 892132395532656641, 892130395193356289, 892131881499725824, 892130759691075584, 892089535739424768, 892089998459252737, 892091175976538112, 892131874780393478, 892131652230553600, 892130634101129217, 892132140657332225, 892131326723334146, 892089159350878209, 892132206914752513, 892131672229122051, 892130753018028036, 892090696831836161, 892131915842629632, 892131112885026816, 892130365015339008, 892089751209218048, 892090845263953921, 892130304953122... \n", + "1 [892132262958977024, 892130594272014340, 892130254055239680, 892131699118587904, 892130764002865156, 892131779091591168, 892131881499725824, 892132439467982848, 892131963129270273, 892131668160647168, 892132070742306820, 892131745591635970, 892130597551955968, 892131874780393478, 892131652230553600, 892131715367542786, 892132367644622848, 892132167022596096, 892131672229122051, 892131758199762944, 892130753018028036, 892131915842629632, 892132131476045824, 892130304953122816, 892131699647184896, 892130322510434305, 892130547413037056, 892131402136813573, 892130353967767552, 892090646923796480, 892130730796625920, 892131465131110400, 892130792616452098, 892130898749083648, 892131397791604736, 892130916197388291, 892130312867651585, 892131167604006913, 892132151612801024, 892130721585930... \n", + "2 [892089115877015552, 892086778278105089, 892078956500975619, 892091135782465537, 892081029984530432, 892091312677244928, 892075919497625601, 892089445238923264, 892090702770847744, 892089535739424768, 892130248632004609, 892090049004744704, 892089417237749761, 892090904496017408, 892079467153297408, 892131590750580738, 892089763620159489, 892131112885026816, 892090854483140608, 892090845263953921, 892131043448422400, 892089943136370688, 892091364539760640, 892079812172554240, 892089538692210688, 892131118199312387, 892089447369523204, 892090701487501313, 892088983395733504, 892090730876895233, 892089166728753152, 892079523264471041, 892089773413847041, 892089148626087937, 892130288360275968, 892089104451616770, 892078291208871936, 892077078744576000, 892091528100929536, 892131057943883... \n", + "3 [892076286184640512, 892090571954761728, 892077117839736832, 892070379296096256, 892073181967785984, 892077226241425408, 892080769237233668, 892071978978484224, 892068426864373762, 892080386607546369, 892070868423237633, 892130575074676737, 892073229480849412, 892079998869360646, 892130362809307137, 892079097689624576, 892087419524304897, 892070868309770240, 892070640555094016, 892130738954530818, 892068911935504385, 892077836248416256, 892086803108331524, 892077359615983616, 892068303648116736, 892071904428687360, 892069284414971904, 892074606005178368, 892078796454723584, 892074228488355841, 892072684061945861, 892131299086942209, 892080434972160001, 892090332782907392, 892072110935486464, 892071928600580096, 892077014265495553, 892068646046117888, 892073173742735364, 892132118888951... \n", + "4 [892073427040952321, 892089404931555328, 892079622162186240, 892080277173919744, 892073268408090625, 892078262825910272, 892071862334652416, 892090273936887815, 892072872159707137, 892078736312500224, 892076008601337856, 892077537055997952, 892078196128071684, 892074034661380096, 892071832010063877, 892077880779259905, 892070024311181313, 892077153411514370, 892074414564675584, 892086556617519104, 892071294413541376, 892070671794163712, 892080681710497792, 892078599469137921, 892076690322538496, 892068390986285056, 892079270729846785, 892131451545817088, 892072749451096068, 892073511258386434, 892068302176104457, 892075876816498689, 892087929572610048, 892069391826898944, 892073434938826757, 892077150735659008, 892074095466119168, 892079681159278592, 892079599173091329, 892077250937573... \n", + "5 [892090848577560577, 892091257257938945, 892091503903883264, 892067993739505666, 892131454536339456, 892090848938209280, 892091348584673280, 892072125137375235, 892090354631151618, 892076543119421441, 892090149885968384, 892089173477400576, 892090228864843777, 892132213701193729, 892131846674415617, 892131450094485504, 892089535739424768, 892131312789860353, 892131116026667008, 892090226939760640, 892090236955762688, 892090843242393602, 892090866751418368, 892086542491156482, 892080458510589952, 892131788310560768, 892132443372875778, 892131112885026816, 892090854483140608, 892090617358041090, 892090845263953921, 892089924962406400, 892090446238932993, 892131955738763266, 892089397482594304, 892076531815772160, 892089538692210688, 892131118199312387, 892070986178232321, 892090695091183... \n", + "6 [892089338799902720, 892089460631982080, 892132445495074816, 892090427532341248, 892089526788730880, 892131591274868737, 892132416747425793, 892131640922710016, 892088505719021570, 892074430108663808, 892132228335099905, 892074128076832769, 892131401478402048, 892070817994952704, 892078757921652736, 892077359615983616, 892088993453670400, 892131567044358144, 892089566198419456, 892130609329557505, 892078940612722689, 892132131476045824, 892132086399856643, 892130760093650944, 892090473548058624, 892090250020831232, 892090646923796480, 892131876902768641, 892131769469763584, 892089900790497280, 892131283391967237, 892091165914411008, 892131747323797504, 892090384440066050, 892130322892156929, 892132339668496384, 892130291791179776, 892073605575651328, 892131879868026880, 892091073505382... \n", + "7 [892089325156007937, 892089553489678336, 892088911857676289, 892132103764287488, 892088919948488704, 892089353333112832, 892091276073598976, 892130262347206656, 892090663151554560, 892089485479071744, 892089858231087106, 892131149354586114, 892089516550483968, 892090299362705408, 892130341762347008, 892131423506886656, 892090140339863554, 892089668509908992, 892088891783688192, 892090575096352768, 892089035145060353, 892089566198419456, 892131705070465024, 892131963015909376, 892132116116504581, 892132131476045824, 892132086399856643, 892089446744633344, 892088901522800648, 892091434748321798, 892089617540722689, 892090971856355328, 892090745452204032, 892089136210997252, 892089476603822080, 892088944120270849, 892090384440066050, 892090821914263553, 892089840543694850, 892090712472272... \n", + "8 [892131274264989696, 892130442002006017, 892131062016561152, 892090730876895233, 892089460631982080, 892073605575651328, 892130944370528256, 892068692371976192, 892132103764287488, 892088966152900608, 892080208022347776, 892131665341849600, 892089494169694208, 892132098739503110, 892090159277232133, 892089240577753088, 892090553055285249, 892088978098315269, 892088971945164800, 892130578979373057, 892130627205554177, 892132228335099905, 892091461684035584, 892086860008349697, 892131272054755332, 892132428432773120, 892089969283661825, 892088875404869635, 892088960519987200, 892131798146203648, 892074524048474112, 892089411059433476, 892090007682527232, 892088993453670400, 892089774818721792, 892132116116504581, 892080976481988608, 892130877769003008, 892130331490254848, 892088952492097... \n", + "9 [892089338799902720, 892090418518740996, 892131879868026880, 892130572432154625, 892090427532341248, 892089036554338304, 892091226425401344, 892072125137375235, 892130716636504064, 892091513995378689, 892091425755734016, 892130252591316992, 892132213701193729, 892130273206296576, 892090069175209988, 892131846674415617, 892090153442975745, 892131702566531078, 892090470410715138, 892090049004744704, 892130725046005761, 892090236955762688, 892090180051628036, 892089498548318208, 892130550684696577, 892090739894738945, 892090176801034240, 892130522129985536, 892089984823570433, 892090672194482176, 892071446410809344, 892070628349685760, 892130880080052224, 892090446238932993, 892132283087548417, 892088928437755904, 892089900790497280, 892088959341395968, 892132005235830784, 892090652279701... \n", + "\n", + " cnt_post_ids \n", + "0 739 \n", + "1 335 \n", + "2 284 \n", + "3 171 \n", + "4 87 \n", + "5 85 \n", + "6 72 \n", + "7 65 \n", + "8 54 \n", + "9 49 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# group by hashtag, count and collect post ids. sort by count.\n", + "df3=df2.groupby('_tag')\\\n", + " .agg(\n", + " F.collect_set('post_id').alias('post_ids'),\n", + " F.count('post_id').alias('cnt_post_ids')\n", + " )\\\n", + ".sort('cnt_post_ids', ascending=False)\n", + "\n", + "pp(df3)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tagpost_idscnt_post_ids
0trump[892130311353491456, 892089069508886528, 892091196536958976, 892089022285324288, 892091257257938945, 892131104123113473, 892131967268933632, 892091382743134208, 892089570493431810, 892132402503483392, 892090418946605057, 892130277237161986, 892090944748752896, 892091096368529410, 892090195515916288, 892131105599614976, 892080208022347776, 892132395532656641, 892130395193356289, 892131881499725824, 892130759691075584, 892089535739424768, 892089998459252737, 892091175976538112, 892131874780393478, 892131652230553600, 892130634101129217, 892132140657332225, 892131326723334146, 892089159350878209, 892132206914752513, 892131672229122051, 892130753018028036, 892090696831836161, 892131915842629632, 892131112885026816, 892130365015339008, 892089751209218048, 892090845263953921, 892130304953122...739
1scaramucci[892132262958977024, 892130594272014340, 892130254055239680, 892131699118587904, 892130764002865156, 892131779091591168, 892131881499725824, 892132439467982848, 892131963129270273, 892131668160647168, 892132070742306820, 892131745591635970, 892130597551955968, 892131874780393478, 892131652230553600, 892131715367542786, 892132367644622848, 892132167022596096, 892131672229122051, 892131758199762944, 892130753018028036, 892131915842629632, 892132131476045824, 892130304953122816, 892131699647184896, 892130322510434305, 892130547413037056, 892131402136813573, 892130353967767552, 892090646923796480, 892130730796625920, 892131465131110400, 892130792616452098, 892130898749083648, 892131397791604736, 892130916197388291, 892130312867651585, 892131167604006913, 892132151612801024, 892130721585930...335
2maga[892089115877015552, 892086778278105089, 892078956500975619, 892091135782465537, 892081029984530432, 892091312677244928, 892075919497625601, 892089445238923264, 892090702770847744, 892089535739424768, 892130248632004609, 892090049004744704, 892089417237749761, 892090904496017408, 892079467153297408, 892131590750580738, 892089763620159489, 892131112885026816, 892090854483140608, 892090845263953921, 892131043448422400, 892089943136370688, 892091364539760640, 892079812172554240, 892089538692210688, 892131118199312387, 892089447369523204, 892090701487501313, 892088983395733504, 892090730876895233, 892089166728753152, 892079523264471041, 892089773413847041, 892089148626087937, 892130288360275968, 892089104451616770, 892078291208871936, 892077078744576000, 892091528100929536, 892131057943883...284
3police[892076286184640512, 892090571954761728, 892077117839736832, 892070379296096256, 892073181967785984, 892077226241425408, 892080769237233668, 892071978978484224, 892068426864373762, 892080386607546369, 892070868423237633, 892130575074676737, 892073229480849412, 892079998869360646, 892130362809307137, 892079097689624576, 892087419524304897, 892070868309770240, 892070640555094016, 892130738954530818, 892068911935504385, 892077836248416256, 892086803108331524, 892077359615983616, 892068303648116736, 892071904428687360, 892069284414971904, 892074606005178368, 892078796454723584, 892074228488355841, 892072684061945861, 892131299086942209, 892080434972160001, 892090332782907392, 892072110935486464, 892071928600580096, 892077014265495553, 892068646046117888, 892073173742735364, 892132118888951...171
4noconfederate[892073427040952321, 892089404931555328, 892079622162186240, 892080277173919744, 892073268408090625, 892078262825910272, 892071862334652416, 892090273936887815, 892072872159707137, 892078736312500224, 892076008601337856, 892077537055997952, 892078196128071684, 892074034661380096, 892071832010063877, 892077880779259905, 892070024311181313, 892077153411514370, 892074414564675584, 892086556617519104, 892071294413541376, 892070671794163712, 892080681710497792, 892078599469137921, 892076690322538496, 892068390986285056, 892079270729846785, 892131451545817088, 892072749451096068, 892073511258386434, 892068302176104457, 892075876816498689, 892087929572610048, 892069391826898944, 892073434938826757, 892077150735659008, 892074095466119168, 892079681159278592, 892079599173091329, 892077250937573...87
5tcot[892090848577560577, 892091257257938945, 892091503903883264, 892067993739505666, 892131454536339456, 892090848938209280, 892091348584673280, 892072125137375235, 892090354631151618, 892076543119421441, 892090149885968384, 892089173477400576, 892090228864843777, 892132213701193729, 892131846674415617, 892131450094485504, 892089535739424768, 892131312789860353, 892131116026667008, 892090226939760640, 892090236955762688, 892090843242393602, 892090866751418368, 892086542491156482, 892080458510589952, 892131788310560768, 892132443372875778, 892131112885026816, 892090854483140608, 892090617358041090, 892090845263953921, 892089924962406400, 892090446238932993, 892131955738763266, 892089397482594304, 892076531815772160, 892089538692210688, 892131118199312387, 892070986178232321, 892090695091183...85
6resist[892089338799902720, 892089460631982080, 892132445495074816, 892090427532341248, 892089526788730880, 892131591274868737, 892132416747425793, 892131640922710016, 892088505719021570, 892074430108663808, 892132228335099905, 892074128076832769, 892131401478402048, 892070817994952704, 892078757921652736, 892077359615983616, 892088993453670400, 892131567044358144, 892089566198419456, 892130609329557505, 892078940612722689, 892132131476045824, 892132086399856643, 892130760093650944, 892090473548058624, 892090250020831232, 892090646923796480, 892131876902768641, 892131769469763584, 892089900790497280, 892131283391967237, 892091165914411008, 892131747323797504, 892090384440066050, 892130322892156929, 892132339668496384, 892130291791179776, 892073605575651328, 892131879868026880, 892091073505382...72
7trumprussia[892089325156007937, 892089553489678336, 892088911857676289, 892132103764287488, 892088919948488704, 892089353333112832, 892091276073598976, 892130262347206656, 892090663151554560, 892089485479071744, 892089858231087106, 892131149354586114, 892089516550483968, 892090299362705408, 892130341762347008, 892131423506886656, 892090140339863554, 892089668509908992, 892088891783688192, 892090575096352768, 892089035145060353, 892089566198419456, 892131705070465024, 892131963015909376, 892132116116504581, 892132131476045824, 892132086399856643, 892089446744633344, 892088901522800648, 892091434748321798, 892089617540722689, 892090971856355328, 892090745452204032, 892089136210997252, 892089476603822080, 892088944120270849, 892090384440066050, 892090821914263553, 892089840543694850, 892090712472272...65
8theresistance[892131274264989696, 892130442002006017, 892131062016561152, 892090730876895233, 892089460631982080, 892073605575651328, 892130944370528256, 892068692371976192, 892132103764287488, 892088966152900608, 892080208022347776, 892131665341849600, 892089494169694208, 892132098739503110, 892090159277232133, 892089240577753088, 892090553055285249, 892088978098315269, 892088971945164800, 892130578979373057, 892130627205554177, 892132228335099905, 892091461684035584, 892086860008349697, 892131272054755332, 892132428432773120, 892089969283661825, 892088875404869635, 892088960519987200, 892131798146203648, 892074524048474112, 892089411059433476, 892090007682527232, 892088993453670400, 892089774818721792, 892132116116504581, 892080976481988608, 892130877769003008, 892130331490254848, 892088952492097...54
9p2[892089338799902720, 892090418518740996, 892131879868026880, 892130572432154625, 892090427532341248, 892089036554338304, 892091226425401344, 892072125137375235, 892130716636504064, 892091513995378689, 892091425755734016, 892130252591316992, 892132213701193729, 892130273206296576, 892090069175209988, 892131846674415617, 892090153442975745, 892131702566531078, 892090470410715138, 892090049004744704, 892130725046005761, 892090236955762688, 892090180051628036, 892089498548318208, 892130550684696577, 892090739894738945, 892090176801034240, 892130522129985536, 892089984823570433, 892090672194482176, 892071446410809344, 892070628349685760, 892130880080052224, 892090446238932993, 892132283087548417, 892088928437755904, 892089900790497280, 892088959341395968, 892132005235830784, 892090652279701...49
10venezuela[892071382581903361, 892090993348136960, 892073903752974336, 892071666335068160, 892087968294354944, 892090560781180928, 892074709470388225, 892088516464721920, 892076791791312896, 892090587326943233, 892089795429646341, 892090316483952640, 892090605941256192, 892078144873803777, 892089398501801984, 892072450975870977, 892087388809314305, 892073692179685376, 892131081838878720, 892089891630202882, 892071320594378752, 892089848080814081, 892130428043223040, 892068786093903873, 892131776927330304, 892068123775512576, 892087931443093504, 892073716431106048, 892086898004336640, 892131834863202306, 892071199731273730, 892130475212451840, 892068318487695360, 892074988236308481, 892089182885220354, 892072698460950528, 892070397079957506, 892072491472048128, 892071978617667584, 892090740968435...48
11mooch[892130763541553154, 892131216752865281, 892131938563182594, 892131364400766978, 892131095256477697, 892130459710140416, 892132120398766080, 892131143302221824, 892132054241918976, 892130845280153600, 892132171934203904, 892130915144617984, 892130963584688139, 892130934769799168, 892131477860876288, 892131351431974912, 892131894254485504, 892130839261310977, 892130304374317056, 892130597551955968, 892132262099210241, 892132311185203201, 892131565286948872, 892132190808670208, 892131285727981568, 892131650758463488, 892131866429489157, 892132303031463936, 892090646923796480, 892131847362293763, 892130323236093952, 892130245674913792, 892131230245847040, 892131642705313795, 892131474023034881, 892132416701177856, 892131172557443072, 892131987779076096, 892132431498760193, 892130874426372...47
12blacklivesmatter[892075071484813312, 892069987040600065, 892068121594474497, 892072221895798784, 892072469665906688, 892078233541390337, 892086710217187329, 892079202723352576, 892070473864945664, 892080909033365505, 892075021815959556, 892087990171963392, 892073511061147648, 892087121439338497, 892071425544146944, 892069651118604288, 892069481685692417, 892077504810373124, 892089984823570433, 892071292958060544, 892086858443886592, 892074646123577344, 892070377953808384, 892067979793281025, 892074262080745478, 892070369955385344, 892087572842905600, 892068152913350656, 892068091986931713, 892068362276061184, 892069151891521536, 892068065667457024, 892069976215048193, 892090164075532289, 892071064464969729, 892068750601703424, 892079870175567873, 892078913517740033, 892090996892114944, 892130925538050...43
13whitehouse[892132305627742208, 892130725495009281, 892132175147147265, 892131387846922241, 892131938563182594, 892131283803021312, 892090828344115200, 892130605600829443, 892091109815590912, 892090409127624704, 892131580449370116, 892132351790264321, 892130319847084034, 892131343798214656, 892131456331522048, 892131257446019072, 892131093553565696, 892131894254485504, 892131319106473984, 892132455905345536, 892131020052582401, 892132070742306820, 892131652230553600, 892130393088036865, 892132097917431808, 892130873289658368, 892131521422905347, 892131967508066304, 892131567044358144, 892130718939308032, 892132026287104002, 892090971944517637, 892131558651564037, 892089277160579072, 892131032836780037, 892132270441734145, 892132148014186496, 892130667210952710, 892130466052145152, 892132227932332...41
14ripmsando[892068519550091266, 892088181432233988, 892069410416066561, 892071790519996416, 892068903555407872, 892070176249831424, 892070726869467136, 892071419881893889, 892076501461610497, 892078115601743874, 892073016162758656, 892088436533932032, 892071932153262080, 892078987400409088, 892077312803471360, 892074450308517891, 892088389700395009, 892068969674416131, 892090256518000640, 892075719517458433, 892089653481938944, 892088445920829441, 892069391835299840, 892091498522697729, 892077858465746944, 892079000360824833, 892077993329180672, 892072713245872128, 892090951426076672, 892089779243843585, 892090565990506496, 892086761823797249, 892070015419244545, 892090460273094656, 892087394979241985, 892069700557033474, 892076568255836160, 892069719418839041]40
15obamacare[892131607041130497, 892088935572230144, 892130656326733825, 892089917744058370, 892089194713038852, 892090093283848192, 892130277237161986, 892089211368722433, 892130581332582400, 892132386938523650, 892131805410844673, 892090553055285249, 892088905859727360, 892130308488810496, 892078703185973248, 892130662668546051, 892132090686341120, 892132290503077888, 892131686489653249, 892089969283661825, 892131454406324224, 892089411059433476, 892130502307721216, 892089875658309632, 892091428448473089, 892131757981655040, 892089112408338432, 892130760093650944, 892130353967767552, 892089666085806080, 892089385151340544, 892132329547853824, 892090870262050817, 892090380711329792, 892090271751700481, 892131890551033856, 892130523241340929, 892131867855536128]39
16mondaymotivation[892089447210250240, 892091258923044865, 892130605600829443, 892068319410257920, 892079995719368704, 892089099099799552, 892089105064091650, 892090553055285249, 892131743158939648, 892088964550676481, 892090380195213312, 892088725479596032, 892090690670407680, 892091204854308867, 892131312789860353, 892091087078260736, 892091530323906560, 892088890206568449, 892089296315928578, 892089906444484612, 892089969283661825, 892089643524661248, 892089411059433476, 892130492702523392, 892130718939308032, 892077904972197889, 892078641068331008, 892089860793749506, 892091437822627840, 892089681558425600, 892090792164171776, 892068457210163200, 892070630505533440, 892091501785829377, 892090894547120129, 892089389337260036, 892069059625398272]38
17usa[892130311353491456, 892131452288212992, 892072828757049346, 892075966687784961, 892132439778361344, 892090427490398209, 892090352504631297, 892090355839053829, 892090110950477824, 892132352788381696, 892089104074129410, 892091139217649666, 892130303967473664, 892077078744576000, 892070015876423682, 892079921358675970, 892132213701193729, 892131595326337024, 892131781901680640, 892068495780962304, 892131312789860353, 892067981957541888, 892073611640672256, 892131565085405184, 892090739894738945, 892090402634911748, 892068664219807744, 892068961529069568, 892131366149685248, 892090672194482176, 892067972675653634, 892090359760773120, 892130735158702080, 892090652279701504, 892090486755917824]35
18potus[892088414887178241, 892090136216731648, 892068215152619521, 892131539710038020, 892089303588888577, 892090151626838019, 892132175147147265, 892090342320865280, 892131387846922241, 892130345709076480, 892131318112296960, 892130303967473664, 892131284918702080, 892088949040177153, 892089573140029440, 892074291658981379, 892091199372353536, 892088894136733699, 892131385506451456, 892132026287104002, 892089107110866946, 892131519216484352, 892091326421909505, 892069864193630208, 892130870655565825, 892131642965229568, 892090148699201537, 892090606163566593, 892091397716791296, 892130874426372097, 892132403950518275, 892131599013343233]33
19brexit[892078735222091776, 892131669129482240, 892069813471899648, 892070357376651266, 892091078358249475, 892077267895156736, 892069611306450944, 892131474543177728, 892086921719107584, 892069223543033859, 892090332518678528, 892069483443093505, 892090855741370368, 892077262190804997, 892072096553193476, 892071515952435200, 892131502582104064, 892089594883297280, 892089898223759360, 892132384702955520, 892071274943586304, 892090877677621248, 892088880517844993, 892077406302986243, 892078037168205824, 892087776635695104, 892080535031468032, 892089497474789378, 892077066849484800, 892089338363883526, 892091428670771201, 892130335701557248, 892070006992887809]33
20nsfdw2017[892087523849195520, 892087068297494530, 892087068314259458, 892087067915812864, 892087068398170113, 892087109976305665, 892087523769495552, 892087067697713152, 892087068322607105, 892087068293201924, 892087067710279680, 892087067739643904, 892087067504779265, 892087067672444928, 892087523878588416, 892087523706630144, 892087067966156801, 892087068574314497, 892087523719229440, 892087068272283648, 892087067659964416, 892087523668881408, 892087068498767872, 892087067680940033, 892087068758880256, 892087067785777156, 892087068305891329, 892087067609583616, 892087067764809728, 892087067752247296, 892087068347826178, 892087068658200577, 892087067680935936]33
21pjnet[892091257257938945, 892067993739505666, 892132150417510408, 892080227827855360, 892091348584673280, 892130920035106816, 892077078744576000, 892088988353183745, 892090702770847744, 892091020535406592, 892131372592242690, 892130396774821889, 892076209605050368, 892090226939760640, 892131675194314752, 892086542491156482, 892080458510589952, 892130712630906880, 892089415467646979, 892130262796165120, 892090855212945408, 892088450295496709, 892132283087548417, 892073375283032064, 892089754224779264, 892070249872404481, 892088959341395968, 892090075185414144, 892132244722266112, 892131490191912960, 892131511935348737, 892090041845129216]32
22fakenews[892089060428312577, 892130804960305153, 892131434571456513, 892132311248064512, 892089465459556352, 892089313055326208, 892091025761734656, 892088881524260864, 892130958509473794, 892079976882876426, 892074928455004160, 892130820344999936, 892089841189609472, 892091560988463105, 892132006615797762, 892131487465717760, 892090732772810752, 892131540095905792, 892090347702165504, 892131222654267393, 892089120142614532, 892090140746797057, 892072446253293574, 892131338542829568, 892089678127673344, 892088965112705024, 892089023400783872, 892131520709894145, 892089013967896578]29
23pdx911[892075165214953472, 892079688952168449, 892078413401305088, 892079690273378304, 892070103235284992, 892076397262364674, 892078414693150720, 892069605509746688, 892090741110984704, 892077672267890688, 892071115375300608, 892088728780328960, 892088725034852352, 892075166792007680, 892088726481805312, 892089732561215488, 892132258332631041, 892089241173344257, 892070604936888320, 892068601292660736, 892079168564764672, 892075158554296320, 892074142488346624, 892080173234782208, 892072628004962305, 892072629493878784, 892080174547714049]27
24politics[892131871550652416, 892087832109559808, 892090352504631297, 892090355839053829, 892089567930494977, 892090623154679811, 892132166808764418, 892132284979044352, 892132140657332225, 892089031671939072, 892090354253647872, 892132418005729280, 892131294565588992, 892090762007117824, 892090269126021121, 892131008304336897, 892131199216345088, 892077420936925188, 892089561014300673, 892073022722514944, 892091023215734784, 892130273218895872, 892131450224607232, 892090270967361537, 892089774634303488]25
25mersal[892068819388313600, 892069677672693761, 892130496314003458, 892073561556369409, 892076907948265472, 892072919899283456, 892074903469408256, 892077741197082625, 892068249357225986, 892072984059428865, 892074515643080704, 892073132944539649, 892080288150331393, 892077483624980481, 892079466398208001, 892087382081654784, 892087525543600128, 892070393833439233, 892070703217885184, 892069822028107777, 892079124210106368, 892089836479299585, 892087298375827456]24
26protest[892075046759538688, 892091358927888384, 892074928513708032, 892091147874697219, 892068539309469697, 892071529009295360, 892071310330933249, 892089366801248256, 892078987526242304, 892076704289636352, 892091277805842433, 892077181601447936, 892068946052100097, 892078749184872448, 892091225758736386, 892070892578197504, 892091014768451585, 892091319513960448, 892087965199069185, 892077222747664385, 892073296967213056, 892086952912191492, 892091074877034497]24
27trumptrain[892091335490056196, 892132449852858368, 892089476260016131, 892088947014328320, 892090575096352768, 892131967268933632, 892132211197173760, 892089134466183169, 892090949827833856, 892089201893793795, 892132114757537792, 892091379244912640, 892090215610937344, 892132058390122496, 892090971856355328, 892075949776265217, 892090049004744704, 892130408032157696, 892089499169169408, 892089028195082240, 892131401478402048, 892091136050954242, 892130588030902272]23
28impeachtrump[892088875404869635, 892131528595124225, 892089342776279042, 892132051519983620, 892089440314826753, 892090007682527232, 892131394264088576, 892131745939816448, 892091346051379205, 892089049665687552, 892089913050517504, 892131612208623621, 892090103610445824, 892089389878198272, 892088953897066497, 892090160447225857, 892087629533065217, 892130830180466688, 892089852430344192, 892076120081911809, 892089187863859201, 892130604254457859]22
29shooting[892074203297579008, 892077326112092160, 892087513921335296, 892090469773185028, 892075163990106113, 892087327002177536, 892088528208879618, 892068399882223616, 892089189684191232, 892070833950261248, 892089608942612480, 892078395030331393, 892078672730923008, 892091436639731712, 892076011659067392, 892078040737497089, 892073026925318145, 892073216096714756, 892089840807936002, 892075563531345920, 892088892349919234, 892088198091943936]22
30nowplaying[892075689846951937, 892074619481579520, 892091531779338240, 892072547361189888, 892071746039422976, 892075830767112192, 892077202539397120, 892068399026753536, 892070015876423682, 892072239914418176, 892069322054656001, 892075145493233664, 892090052771237888, 892078418396884993, 892131999858733056, 892089732515209216, 892068365413568512, 892071036019200000, 892077410220470272, 892132017013436419, 892070558384390145, 892087199939739648]22
31cnn[892073559677206528, 892089782028845057, 892130291791179776, 892131104123113473, 892090672194482176, 892091212227788800, 892131110360014848, 892132445495074816, 892079097689624576, 892072446253293574, 892130674148163584, 892131283391967237, 892089807328886784, 892089720930541568, 892091303286145025, 892076709813473280, 892090486755917824, 892131112822030336, 892132255048577024, 892089799431012355, 892089640349556736]22
32iran[892130311353491456, 892069225472417793, 892068664219807744, 892087993833533440, 892068613942915072, 892132402503483392, 892086791783763969, 892070414343655424, 892071775458164737, 892069569388462080, 892091035928670208, 892075151831031808, 892068946052100097, 892070494467375105, 892130617399410690, 892070018992701440, 892088874272534528, 892090559594209280, 892073812220661760, 892070204179611648]22
33bharathanenenu[892070816690692096, 892069793569816577, 892079262118797312, 892077363743277056, 892070105693040640, 892071613633511424, 892072666068209664, 892078933818064896, 892078142088785923, 892072687224233985, 892070433276612608, 892070058343546880, 892076040293564417, 892074418217680896, 892069142416814080, 892070141889921024, 892073106717556736, 892073166557728769, 892069209269714946, 892074216417239040]21
34whitehousechaos[892131402661212160, 892131680395431940, 892130464814596096, 892131202567512065, 892131225279889408, 892131734002704384, 892089710176333826, 892090107662131200, 892131259761152000, 892131386781519872, 892090231658344450, 892131118295773186, 892089551346421760, 892130555688607744, 892131651169525764, 892132366495342592, 892131052579434496, 892132422392918018]21
35aiimsdocnamestharoor[892070171786870784, 892079941671686145, 892068300334739456, 892076743866986496, 892089257531129856, 892073526491987968, 892080354189647872, 892068715419676672, 892068642782949376, 892089928204660737, 892075427220639744, 892077976430428162, 892077994214277122, 892078468518756359, 892070580060553216, 892080295733739520, 892068139495600130, 892068723816767488, 892079899783045120]21
36crime[892088998696497155, 892091457754025984, 892076411275575296, 892070201033977857, 892071191883784193, 892086899153797121, 892130823008387072, 892075021232783360, 892067972675653634, 892075963806339072, 892090159117852674, 892091454746767360, 892079816220057602, 892087994945077248, 892075552990953473, 892067981957541888, 892087723573620737, 892087374091620361, 892078495945412608, 892074390548074496, 892070460590084096]21
37northkorea[892090962469670912, 892130492702523392, 892130365015339008, 892090004343857154, 892090387275403264, 892087544250220548, 892090445278453760, 892089105064091650, 892070494467375105, 892091263272591360, 892091204854308867, 892090778100523008, 892130975928471556, 892091237343297538, 892131288974602240, 892090642486231044, 892132122013683712, 892089028861939712, 892089860974096384, 892091082984620032]20
38donaldtrump[892078937622233088, 892090457290833920, 892072828757049346, 892089303588888577, 892091196536958976, 892090156672528384, 892090346326437889, 892132253433769990, 892131283803021312, 892131685134979073, 892131347732541441, 892131881499725824, 892089381426798592, 892131531157757952, 892091397716791296, 892131792626610179, 892089545289871360, 892090346280300544, 892131278283313155, 892088930400686080]20
39hdtalk[892089782028845057, 892089031671939072, 892091548258648064, 892131104123113473, 892132418005729280, 892091212227788800, 892131110360014848, 892131122687098880, 892068709820280832, 892131085676457984, 892089861464670208, 892089871145156608, 892089807328886784, 892077707936251904, 892078757485453319, 892091303286145025, 892131112822030336, 892089640349556736, 892089799431012355]19
40aca[892130661859024897, 892080473463193603, 892089608061698052, 892089984823570433, 892130572432154625, 892089787103969280, 892130760093650944, 892131190500724736, 892089120985673728, 892130252591316992, 892089473420464128, 892091208511746048, 892090527931400195, 892132192809345027, 892091089292730368, 892089307342684164, 892131686489653249, 892130550684696577]19
41mtvhottest[892073504379686914, 892075967115558916, 892073710001283077, 892079811300139008, 892073694939549696, 892091020636299264, 892074743117082626, 892077023132360709, 892074662242471940, 892091043021193217, 892091393535086593, 892074813543632896, 892091560648749056, 892074600091324416, 892090763819044864, 892072493393096706, 892071529953079297, 892091452393754624, 892073690854195200]19
42comey[892090589839282176, 892131103246614528, 892130283994140672, 892131285727981568, 892131364400766978, 892132256730558464, 892131603643744256, 892130637120974849, 892089379711205378, 892130610596245504, 892090566107910145, 892130803332861954, 892131283391967237, 892132055781449731, 892090349736349696, 892131682928783360, 892130634101129217, 892091506516819968]18
43russia[892130563900850178, 892091249292963842, 892131743121133568, 892132463190958082, 892089812076900356, 892130899239763969, 892130332475924480, 892090335328862208, 892088128323956737, 892131610904100864, 892089251407515649, 892091026713739264, 892090309840179202, 892089408400355328, 892132122013683712, 892089501006397445, 892131272054755332, 892087977412890625]18
44kelly[892130378193846272, 892132007521775621, 892132097703522304, 892132119492698112, 892131387846922241, 892091103687593984, 892131908938862592, 892132081563611136, 892130819569061891, 892132256730558464, 892132395532656641, 892132439467982848, 892131694739939328, 892130253660975104, 892130839261310977, 892131283391967237, 892130789659467778]18
45themooch[892130732461744129, 892131128785723392, 892130559811387392, 892130510209765376, 892130644985339904, 892131734002704384, 892130877769003008, 892131357392064512, 892132439467982848, 892132396560252930, 892130998598725632, 892131939750162432, 892131894254485504, 892130721585930241, 892130852146208768, 892131745591635970, 892130286712086530, 892131695725387777]18
46msnbc[892130291791179776, 892089795345870848, 892132445495074816, 892090624001933313, 892070377953808384, 892080707388026880, 892131797298855936, 892130681584746497, 892130924984360960, 892079097689624576, 892130674148163584, 892131283391967237, 892090294400937985, 892131384705396736, 892071064464969729, 892132107039916034, 892132255048577024]17
47c4news[892090872002732034, 892091156670152708, 892091246755315712, 892091347406139392, 892090194601672704, 892091397955878912, 892089020741812224, 892091034687209472, 892089092766388224, 892090424378183681, 892089196231491585, 892090668776083457, 892090951967154176, 892090989011247104, 892089593541140480, 892090553814396928, 892090821104873473]17
48americafirst[892073223319429120, 892088983395733504, 892090539637690370, 892130630665986049, 892080707388026880, 892089769215307777, 892132135829553152, 892089695630438400, 892131064134676480, 892132213701193729, 892090652279701504, 892089296135573505, 892070351290675200, 892072587802677249, 892089028195082240, 892131590750580738, 892090739894738945]17
49vivegam[892087890196406272, 892069460852449281, 892069080290566144, 892070166942466049, 892130940536868864, 892079387431936000, 892087873872183299, 892068016996798468, 892069253083312128, 892069478787194880, 892069015945748480, 892069331554652160, 892078821406416896, 892079296235290630, 892068402709135362, 892073292651044864, 892068748319838209]17
\n", + "
" + ], + "text/plain": [ + " _tag \\\n", + "0 trump \n", + "1 scaramucci \n", + "2 maga \n", + "3 police \n", + "4 noconfederate \n", + "5 tcot \n", + "6 resist \n", + "7 trumprussia \n", + "8 theresistance \n", + "9 p2 \n", + "10 venezuela \n", + "11 mooch \n", + "12 blacklivesmatter \n", + "13 whitehouse \n", + "14 ripmsando \n", + "15 obamacare \n", + "16 mondaymotivation \n", + "17 usa \n", + "18 potus \n", + "19 brexit \n", + "20 nsfdw2017 \n", + "21 pjnet \n", + "22 fakenews \n", + "23 pdx911 \n", + "24 politics \n", + "25 mersal \n", + "26 protest \n", + "27 trumptrain \n", + "28 impeachtrump \n", + "29 shooting \n", + "30 nowplaying \n", + "31 cnn \n", + "32 iran \n", + "33 bharathanenenu \n", + "34 whitehousechaos \n", + "35 aiimsdocnamestharoor \n", + "36 crime \n", + "37 northkorea \n", + "38 donaldtrump \n", + "39 hdtalk \n", + "40 aca \n", + "41 mtvhottest \n", + "42 comey \n", + "43 russia \n", + "44 kelly \n", + "45 themooch \n", + "46 msnbc \n", + "47 c4news \n", + "48 americafirst \n", + "49 vivegam \n", + "\n", + " post_ids \\\n", + "0 [892130311353491456, 892089069508886528, 892091196536958976, 892089022285324288, 892091257257938945, 892131104123113473, 892131967268933632, 892091382743134208, 892089570493431810, 892132402503483392, 892090418946605057, 892130277237161986, 892090944748752896, 892091096368529410, 892090195515916288, 892131105599614976, 892080208022347776, 892132395532656641, 892130395193356289, 892131881499725824, 892130759691075584, 892089535739424768, 892089998459252737, 892091175976538112, 892131874780393478, 892131652230553600, 892130634101129217, 892132140657332225, 892131326723334146, 892089159350878209, 892132206914752513, 892131672229122051, 892130753018028036, 892090696831836161, 892131915842629632, 892131112885026816, 892130365015339008, 892089751209218048, 892090845263953921, 892130304953122... \n", + "1 [892132262958977024, 892130594272014340, 892130254055239680, 892131699118587904, 892130764002865156, 892131779091591168, 892131881499725824, 892132439467982848, 892131963129270273, 892131668160647168, 892132070742306820, 892131745591635970, 892130597551955968, 892131874780393478, 892131652230553600, 892131715367542786, 892132367644622848, 892132167022596096, 892131672229122051, 892131758199762944, 892130753018028036, 892131915842629632, 892132131476045824, 892130304953122816, 892131699647184896, 892130322510434305, 892130547413037056, 892131402136813573, 892130353967767552, 892090646923796480, 892130730796625920, 892131465131110400, 892130792616452098, 892130898749083648, 892131397791604736, 892130916197388291, 892130312867651585, 892131167604006913, 892132151612801024, 892130721585930... \n", + "2 [892089115877015552, 892086778278105089, 892078956500975619, 892091135782465537, 892081029984530432, 892091312677244928, 892075919497625601, 892089445238923264, 892090702770847744, 892089535739424768, 892130248632004609, 892090049004744704, 892089417237749761, 892090904496017408, 892079467153297408, 892131590750580738, 892089763620159489, 892131112885026816, 892090854483140608, 892090845263953921, 892131043448422400, 892089943136370688, 892091364539760640, 892079812172554240, 892089538692210688, 892131118199312387, 892089447369523204, 892090701487501313, 892088983395733504, 892090730876895233, 892089166728753152, 892079523264471041, 892089773413847041, 892089148626087937, 892130288360275968, 892089104451616770, 892078291208871936, 892077078744576000, 892091528100929536, 892131057943883... \n", + "3 [892076286184640512, 892090571954761728, 892077117839736832, 892070379296096256, 892073181967785984, 892077226241425408, 892080769237233668, 892071978978484224, 892068426864373762, 892080386607546369, 892070868423237633, 892130575074676737, 892073229480849412, 892079998869360646, 892130362809307137, 892079097689624576, 892087419524304897, 892070868309770240, 892070640555094016, 892130738954530818, 892068911935504385, 892077836248416256, 892086803108331524, 892077359615983616, 892068303648116736, 892071904428687360, 892069284414971904, 892074606005178368, 892078796454723584, 892074228488355841, 892072684061945861, 892131299086942209, 892080434972160001, 892090332782907392, 892072110935486464, 892071928600580096, 892077014265495553, 892068646046117888, 892073173742735364, 892132118888951... \n", + "4 [892073427040952321, 892089404931555328, 892079622162186240, 892080277173919744, 892073268408090625, 892078262825910272, 892071862334652416, 892090273936887815, 892072872159707137, 892078736312500224, 892076008601337856, 892077537055997952, 892078196128071684, 892074034661380096, 892071832010063877, 892077880779259905, 892070024311181313, 892077153411514370, 892074414564675584, 892086556617519104, 892071294413541376, 892070671794163712, 892080681710497792, 892078599469137921, 892076690322538496, 892068390986285056, 892079270729846785, 892131451545817088, 892072749451096068, 892073511258386434, 892068302176104457, 892075876816498689, 892087929572610048, 892069391826898944, 892073434938826757, 892077150735659008, 892074095466119168, 892079681159278592, 892079599173091329, 892077250937573... \n", + "5 [892090848577560577, 892091257257938945, 892091503903883264, 892067993739505666, 892131454536339456, 892090848938209280, 892091348584673280, 892072125137375235, 892090354631151618, 892076543119421441, 892090149885968384, 892089173477400576, 892090228864843777, 892132213701193729, 892131846674415617, 892131450094485504, 892089535739424768, 892131312789860353, 892131116026667008, 892090226939760640, 892090236955762688, 892090843242393602, 892090866751418368, 892086542491156482, 892080458510589952, 892131788310560768, 892132443372875778, 892131112885026816, 892090854483140608, 892090617358041090, 892090845263953921, 892089924962406400, 892090446238932993, 892131955738763266, 892089397482594304, 892076531815772160, 892089538692210688, 892131118199312387, 892070986178232321, 892090695091183... \n", + "6 [892089338799902720, 892089460631982080, 892132445495074816, 892090427532341248, 892089526788730880, 892131591274868737, 892132416747425793, 892131640922710016, 892088505719021570, 892074430108663808, 892132228335099905, 892074128076832769, 892131401478402048, 892070817994952704, 892078757921652736, 892077359615983616, 892088993453670400, 892131567044358144, 892089566198419456, 892130609329557505, 892078940612722689, 892132131476045824, 892132086399856643, 892130760093650944, 892090473548058624, 892090250020831232, 892090646923796480, 892131876902768641, 892131769469763584, 892089900790497280, 892131283391967237, 892091165914411008, 892131747323797504, 892090384440066050, 892130322892156929, 892132339668496384, 892130291791179776, 892073605575651328, 892131879868026880, 892091073505382... \n", + "7 [892089325156007937, 892089553489678336, 892088911857676289, 892132103764287488, 892088919948488704, 892089353333112832, 892091276073598976, 892130262347206656, 892090663151554560, 892089485479071744, 892089858231087106, 892131149354586114, 892089516550483968, 892090299362705408, 892130341762347008, 892131423506886656, 892090140339863554, 892089668509908992, 892088891783688192, 892090575096352768, 892089035145060353, 892089566198419456, 892131705070465024, 892131963015909376, 892132116116504581, 892132131476045824, 892132086399856643, 892089446744633344, 892088901522800648, 892091434748321798, 892089617540722689, 892090971856355328, 892090745452204032, 892089136210997252, 892089476603822080, 892088944120270849, 892090384440066050, 892090821914263553, 892089840543694850, 892090712472272... \n", + "8 [892131274264989696, 892130442002006017, 892131062016561152, 892090730876895233, 892089460631982080, 892073605575651328, 892130944370528256, 892068692371976192, 892132103764287488, 892088966152900608, 892080208022347776, 892131665341849600, 892089494169694208, 892132098739503110, 892090159277232133, 892089240577753088, 892090553055285249, 892088978098315269, 892088971945164800, 892130578979373057, 892130627205554177, 892132228335099905, 892091461684035584, 892086860008349697, 892131272054755332, 892132428432773120, 892089969283661825, 892088875404869635, 892088960519987200, 892131798146203648, 892074524048474112, 892089411059433476, 892090007682527232, 892088993453670400, 892089774818721792, 892132116116504581, 892080976481988608, 892130877769003008, 892130331490254848, 892088952492097... \n", + "9 [892089338799902720, 892090418518740996, 892131879868026880, 892130572432154625, 892090427532341248, 892089036554338304, 892091226425401344, 892072125137375235, 892130716636504064, 892091513995378689, 892091425755734016, 892130252591316992, 892132213701193729, 892130273206296576, 892090069175209988, 892131846674415617, 892090153442975745, 892131702566531078, 892090470410715138, 892090049004744704, 892130725046005761, 892090236955762688, 892090180051628036, 892089498548318208, 892130550684696577, 892090739894738945, 892090176801034240, 892130522129985536, 892089984823570433, 892090672194482176, 892071446410809344, 892070628349685760, 892130880080052224, 892090446238932993, 892132283087548417, 892088928437755904, 892089900790497280, 892088959341395968, 892132005235830784, 892090652279701... \n", + "10 [892071382581903361, 892090993348136960, 892073903752974336, 892071666335068160, 892087968294354944, 892090560781180928, 892074709470388225, 892088516464721920, 892076791791312896, 892090587326943233, 892089795429646341, 892090316483952640, 892090605941256192, 892078144873803777, 892089398501801984, 892072450975870977, 892087388809314305, 892073692179685376, 892131081838878720, 892089891630202882, 892071320594378752, 892089848080814081, 892130428043223040, 892068786093903873, 892131776927330304, 892068123775512576, 892087931443093504, 892073716431106048, 892086898004336640, 892131834863202306, 892071199731273730, 892130475212451840, 892068318487695360, 892074988236308481, 892089182885220354, 892072698460950528, 892070397079957506, 892072491472048128, 892071978617667584, 892090740968435... \n", + "11 [892130763541553154, 892131216752865281, 892131938563182594, 892131364400766978, 892131095256477697, 892130459710140416, 892132120398766080, 892131143302221824, 892132054241918976, 892130845280153600, 892132171934203904, 892130915144617984, 892130963584688139, 892130934769799168, 892131477860876288, 892131351431974912, 892131894254485504, 892130839261310977, 892130304374317056, 892130597551955968, 892132262099210241, 892132311185203201, 892131565286948872, 892132190808670208, 892131285727981568, 892131650758463488, 892131866429489157, 892132303031463936, 892090646923796480, 892131847362293763, 892130323236093952, 892130245674913792, 892131230245847040, 892131642705313795, 892131474023034881, 892132416701177856, 892131172557443072, 892131987779076096, 892132431498760193, 892130874426372... \n", + "12 [892075071484813312, 892069987040600065, 892068121594474497, 892072221895798784, 892072469665906688, 892078233541390337, 892086710217187329, 892079202723352576, 892070473864945664, 892080909033365505, 892075021815959556, 892087990171963392, 892073511061147648, 892087121439338497, 892071425544146944, 892069651118604288, 892069481685692417, 892077504810373124, 892089984823570433, 892071292958060544, 892086858443886592, 892074646123577344, 892070377953808384, 892067979793281025, 892074262080745478, 892070369955385344, 892087572842905600, 892068152913350656, 892068091986931713, 892068362276061184, 892069151891521536, 892068065667457024, 892069976215048193, 892090164075532289, 892071064464969729, 892068750601703424, 892079870175567873, 892078913517740033, 892090996892114944, 892130925538050... \n", + "13 [892132305627742208, 892130725495009281, 892132175147147265, 892131387846922241, 892131938563182594, 892131283803021312, 892090828344115200, 892130605600829443, 892091109815590912, 892090409127624704, 892131580449370116, 892132351790264321, 892130319847084034, 892131343798214656, 892131456331522048, 892131257446019072, 892131093553565696, 892131894254485504, 892131319106473984, 892132455905345536, 892131020052582401, 892132070742306820, 892131652230553600, 892130393088036865, 892132097917431808, 892130873289658368, 892131521422905347, 892131967508066304, 892131567044358144, 892130718939308032, 892132026287104002, 892090971944517637, 892131558651564037, 892089277160579072, 892131032836780037, 892132270441734145, 892132148014186496, 892130667210952710, 892130466052145152, 892132227932332... \n", + "14 [892068519550091266, 892088181432233988, 892069410416066561, 892071790519996416, 892068903555407872, 892070176249831424, 892070726869467136, 892071419881893889, 892076501461610497, 892078115601743874, 892073016162758656, 892088436533932032, 892071932153262080, 892078987400409088, 892077312803471360, 892074450308517891, 892088389700395009, 892068969674416131, 892090256518000640, 892075719517458433, 892089653481938944, 892088445920829441, 892069391835299840, 892091498522697729, 892077858465746944, 892079000360824833, 892077993329180672, 892072713245872128, 892090951426076672, 892089779243843585, 892090565990506496, 892086761823797249, 892070015419244545, 892090460273094656, 892087394979241985, 892069700557033474, 892076568255836160, 892069719418839041] \n", + "15 [892131607041130497, 892088935572230144, 892130656326733825, 892089917744058370, 892089194713038852, 892090093283848192, 892130277237161986, 892089211368722433, 892130581332582400, 892132386938523650, 892131805410844673, 892090553055285249, 892088905859727360, 892130308488810496, 892078703185973248, 892130662668546051, 892132090686341120, 892132290503077888, 892131686489653249, 892089969283661825, 892131454406324224, 892089411059433476, 892130502307721216, 892089875658309632, 892091428448473089, 892131757981655040, 892089112408338432, 892130760093650944, 892130353967767552, 892089666085806080, 892089385151340544, 892132329547853824, 892090870262050817, 892090380711329792, 892090271751700481, 892131890551033856, 892130523241340929, 892131867855536128] \n", + "16 [892089447210250240, 892091258923044865, 892130605600829443, 892068319410257920, 892079995719368704, 892089099099799552, 892089105064091650, 892090553055285249, 892131743158939648, 892088964550676481, 892090380195213312, 892088725479596032, 892090690670407680, 892091204854308867, 892131312789860353, 892091087078260736, 892091530323906560, 892088890206568449, 892089296315928578, 892089906444484612, 892089969283661825, 892089643524661248, 892089411059433476, 892130492702523392, 892130718939308032, 892077904972197889, 892078641068331008, 892089860793749506, 892091437822627840, 892089681558425600, 892090792164171776, 892068457210163200, 892070630505533440, 892091501785829377, 892090894547120129, 892089389337260036, 892069059625398272] \n", + "17 [892130311353491456, 892131452288212992, 892072828757049346, 892075966687784961, 892132439778361344, 892090427490398209, 892090352504631297, 892090355839053829, 892090110950477824, 892132352788381696, 892089104074129410, 892091139217649666, 892130303967473664, 892077078744576000, 892070015876423682, 892079921358675970, 892132213701193729, 892131595326337024, 892131781901680640, 892068495780962304, 892131312789860353, 892067981957541888, 892073611640672256, 892131565085405184, 892090739894738945, 892090402634911748, 892068664219807744, 892068961529069568, 892131366149685248, 892090672194482176, 892067972675653634, 892090359760773120, 892130735158702080, 892090652279701504, 892090486755917824] \n", + "18 [892088414887178241, 892090136216731648, 892068215152619521, 892131539710038020, 892089303588888577, 892090151626838019, 892132175147147265, 892090342320865280, 892131387846922241, 892130345709076480, 892131318112296960, 892130303967473664, 892131284918702080, 892088949040177153, 892089573140029440, 892074291658981379, 892091199372353536, 892088894136733699, 892131385506451456, 892132026287104002, 892089107110866946, 892131519216484352, 892091326421909505, 892069864193630208, 892130870655565825, 892131642965229568, 892090148699201537, 892090606163566593, 892091397716791296, 892130874426372097, 892132403950518275, 892131599013343233] \n", + "19 [892078735222091776, 892131669129482240, 892069813471899648, 892070357376651266, 892091078358249475, 892077267895156736, 892069611306450944, 892131474543177728, 892086921719107584, 892069223543033859, 892090332518678528, 892069483443093505, 892090855741370368, 892077262190804997, 892072096553193476, 892071515952435200, 892131502582104064, 892089594883297280, 892089898223759360, 892132384702955520, 892071274943586304, 892090877677621248, 892088880517844993, 892077406302986243, 892078037168205824, 892087776635695104, 892080535031468032, 892089497474789378, 892077066849484800, 892089338363883526, 892091428670771201, 892130335701557248, 892070006992887809] \n", + "20 [892087523849195520, 892087068297494530, 892087068314259458, 892087067915812864, 892087068398170113, 892087109976305665, 892087523769495552, 892087067697713152, 892087068322607105, 892087068293201924, 892087067710279680, 892087067739643904, 892087067504779265, 892087067672444928, 892087523878588416, 892087523706630144, 892087067966156801, 892087068574314497, 892087523719229440, 892087068272283648, 892087067659964416, 892087523668881408, 892087068498767872, 892087067680940033, 892087068758880256, 892087067785777156, 892087068305891329, 892087067609583616, 892087067764809728, 892087067752247296, 892087068347826178, 892087068658200577, 892087067680935936] \n", + "21 [892091257257938945, 892067993739505666, 892132150417510408, 892080227827855360, 892091348584673280, 892130920035106816, 892077078744576000, 892088988353183745, 892090702770847744, 892091020535406592, 892131372592242690, 892130396774821889, 892076209605050368, 892090226939760640, 892131675194314752, 892086542491156482, 892080458510589952, 892130712630906880, 892089415467646979, 892130262796165120, 892090855212945408, 892088450295496709, 892132283087548417, 892073375283032064, 892089754224779264, 892070249872404481, 892088959341395968, 892090075185414144, 892132244722266112, 892131490191912960, 892131511935348737, 892090041845129216] \n", + "22 [892089060428312577, 892130804960305153, 892131434571456513, 892132311248064512, 892089465459556352, 892089313055326208, 892091025761734656, 892088881524260864, 892130958509473794, 892079976882876426, 892074928455004160, 892130820344999936, 892089841189609472, 892091560988463105, 892132006615797762, 892131487465717760, 892090732772810752, 892131540095905792, 892090347702165504, 892131222654267393, 892089120142614532, 892090140746797057, 892072446253293574, 892131338542829568, 892089678127673344, 892088965112705024, 892089023400783872, 892131520709894145, 892089013967896578] \n", + "23 [892075165214953472, 892079688952168449, 892078413401305088, 892079690273378304, 892070103235284992, 892076397262364674, 892078414693150720, 892069605509746688, 892090741110984704, 892077672267890688, 892071115375300608, 892088728780328960, 892088725034852352, 892075166792007680, 892088726481805312, 892089732561215488, 892132258332631041, 892089241173344257, 892070604936888320, 892068601292660736, 892079168564764672, 892075158554296320, 892074142488346624, 892080173234782208, 892072628004962305, 892072629493878784, 892080174547714049] \n", + "24 [892131871550652416, 892087832109559808, 892090352504631297, 892090355839053829, 892089567930494977, 892090623154679811, 892132166808764418, 892132284979044352, 892132140657332225, 892089031671939072, 892090354253647872, 892132418005729280, 892131294565588992, 892090762007117824, 892090269126021121, 892131008304336897, 892131199216345088, 892077420936925188, 892089561014300673, 892073022722514944, 892091023215734784, 892130273218895872, 892131450224607232, 892090270967361537, 892089774634303488] \n", + "25 [892068819388313600, 892069677672693761, 892130496314003458, 892073561556369409, 892076907948265472, 892072919899283456, 892074903469408256, 892077741197082625, 892068249357225986, 892072984059428865, 892074515643080704, 892073132944539649, 892080288150331393, 892077483624980481, 892079466398208001, 892087382081654784, 892087525543600128, 892070393833439233, 892070703217885184, 892069822028107777, 892079124210106368, 892089836479299585, 892087298375827456] \n", + "26 [892075046759538688, 892091358927888384, 892074928513708032, 892091147874697219, 892068539309469697, 892071529009295360, 892071310330933249, 892089366801248256, 892078987526242304, 892076704289636352, 892091277805842433, 892077181601447936, 892068946052100097, 892078749184872448, 892091225758736386, 892070892578197504, 892091014768451585, 892091319513960448, 892087965199069185, 892077222747664385, 892073296967213056, 892086952912191492, 892091074877034497] \n", + "27 [892091335490056196, 892132449852858368, 892089476260016131, 892088947014328320, 892090575096352768, 892131967268933632, 892132211197173760, 892089134466183169, 892090949827833856, 892089201893793795, 892132114757537792, 892091379244912640, 892090215610937344, 892132058390122496, 892090971856355328, 892075949776265217, 892090049004744704, 892130408032157696, 892089499169169408, 892089028195082240, 892131401478402048, 892091136050954242, 892130588030902272] \n", + "28 [892088875404869635, 892131528595124225, 892089342776279042, 892132051519983620, 892089440314826753, 892090007682527232, 892131394264088576, 892131745939816448, 892091346051379205, 892089049665687552, 892089913050517504, 892131612208623621, 892090103610445824, 892089389878198272, 892088953897066497, 892090160447225857, 892087629533065217, 892130830180466688, 892089852430344192, 892076120081911809, 892089187863859201, 892130604254457859] \n", + "29 [892074203297579008, 892077326112092160, 892087513921335296, 892090469773185028, 892075163990106113, 892087327002177536, 892088528208879618, 892068399882223616, 892089189684191232, 892070833950261248, 892089608942612480, 892078395030331393, 892078672730923008, 892091436639731712, 892076011659067392, 892078040737497089, 892073026925318145, 892073216096714756, 892089840807936002, 892075563531345920, 892088892349919234, 892088198091943936] \n", + "30 [892075689846951937, 892074619481579520, 892091531779338240, 892072547361189888, 892071746039422976, 892075830767112192, 892077202539397120, 892068399026753536, 892070015876423682, 892072239914418176, 892069322054656001, 892075145493233664, 892090052771237888, 892078418396884993, 892131999858733056, 892089732515209216, 892068365413568512, 892071036019200000, 892077410220470272, 892132017013436419, 892070558384390145, 892087199939739648] \n", + "31 [892073559677206528, 892089782028845057, 892130291791179776, 892131104123113473, 892090672194482176, 892091212227788800, 892131110360014848, 892132445495074816, 892079097689624576, 892072446253293574, 892130674148163584, 892131283391967237, 892089807328886784, 892089720930541568, 892091303286145025, 892076709813473280, 892090486755917824, 892131112822030336, 892132255048577024, 892089799431012355, 892089640349556736] \n", + "32 [892130311353491456, 892069225472417793, 892068664219807744, 892087993833533440, 892068613942915072, 892132402503483392, 892086791783763969, 892070414343655424, 892071775458164737, 892069569388462080, 892091035928670208, 892075151831031808, 892068946052100097, 892070494467375105, 892130617399410690, 892070018992701440, 892088874272534528, 892090559594209280, 892073812220661760, 892070204179611648] \n", + "33 [892070816690692096, 892069793569816577, 892079262118797312, 892077363743277056, 892070105693040640, 892071613633511424, 892072666068209664, 892078933818064896, 892078142088785923, 892072687224233985, 892070433276612608, 892070058343546880, 892076040293564417, 892074418217680896, 892069142416814080, 892070141889921024, 892073106717556736, 892073166557728769, 892069209269714946, 892074216417239040] \n", + "34 [892131402661212160, 892131680395431940, 892130464814596096, 892131202567512065, 892131225279889408, 892131734002704384, 892089710176333826, 892090107662131200, 892131259761152000, 892131386781519872, 892090231658344450, 892131118295773186, 892089551346421760, 892130555688607744, 892131651169525764, 892132366495342592, 892131052579434496, 892132422392918018] \n", + "35 [892070171786870784, 892079941671686145, 892068300334739456, 892076743866986496, 892089257531129856, 892073526491987968, 892080354189647872, 892068715419676672, 892068642782949376, 892089928204660737, 892075427220639744, 892077976430428162, 892077994214277122, 892078468518756359, 892070580060553216, 892080295733739520, 892068139495600130, 892068723816767488, 892079899783045120] \n", + "36 [892088998696497155, 892091457754025984, 892076411275575296, 892070201033977857, 892071191883784193, 892086899153797121, 892130823008387072, 892075021232783360, 892067972675653634, 892075963806339072, 892090159117852674, 892091454746767360, 892079816220057602, 892087994945077248, 892075552990953473, 892067981957541888, 892087723573620737, 892087374091620361, 892078495945412608, 892074390548074496, 892070460590084096] \n", + "37 [892090962469670912, 892130492702523392, 892130365015339008, 892090004343857154, 892090387275403264, 892087544250220548, 892090445278453760, 892089105064091650, 892070494467375105, 892091263272591360, 892091204854308867, 892090778100523008, 892130975928471556, 892091237343297538, 892131288974602240, 892090642486231044, 892132122013683712, 892089028861939712, 892089860974096384, 892091082984620032] \n", + "38 [892078937622233088, 892090457290833920, 892072828757049346, 892089303588888577, 892091196536958976, 892090156672528384, 892090346326437889, 892132253433769990, 892131283803021312, 892131685134979073, 892131347732541441, 892131881499725824, 892089381426798592, 892131531157757952, 892091397716791296, 892131792626610179, 892089545289871360, 892090346280300544, 892131278283313155, 892088930400686080] \n", + "39 [892089782028845057, 892089031671939072, 892091548258648064, 892131104123113473, 892132418005729280, 892091212227788800, 892131110360014848, 892131122687098880, 892068709820280832, 892131085676457984, 892089861464670208, 892089871145156608, 892089807328886784, 892077707936251904, 892078757485453319, 892091303286145025, 892131112822030336, 892089640349556736, 892089799431012355] \n", + "40 [892130661859024897, 892080473463193603, 892089608061698052, 892089984823570433, 892130572432154625, 892089787103969280, 892130760093650944, 892131190500724736, 892089120985673728, 892130252591316992, 892089473420464128, 892091208511746048, 892090527931400195, 892132192809345027, 892091089292730368, 892089307342684164, 892131686489653249, 892130550684696577] \n", + "41 [892073504379686914, 892075967115558916, 892073710001283077, 892079811300139008, 892073694939549696, 892091020636299264, 892074743117082626, 892077023132360709, 892074662242471940, 892091043021193217, 892091393535086593, 892074813543632896, 892091560648749056, 892074600091324416, 892090763819044864, 892072493393096706, 892071529953079297, 892091452393754624, 892073690854195200] \n", + "42 [892090589839282176, 892131103246614528, 892130283994140672, 892131285727981568, 892131364400766978, 892132256730558464, 892131603643744256, 892130637120974849, 892089379711205378, 892130610596245504, 892090566107910145, 892130803332861954, 892131283391967237, 892132055781449731, 892090349736349696, 892131682928783360, 892130634101129217, 892091506516819968] \n", + "43 [892130563900850178, 892091249292963842, 892131743121133568, 892132463190958082, 892089812076900356, 892130899239763969, 892130332475924480, 892090335328862208, 892088128323956737, 892131610904100864, 892089251407515649, 892091026713739264, 892090309840179202, 892089408400355328, 892132122013683712, 892089501006397445, 892131272054755332, 892087977412890625] \n", + "44 [892130378193846272, 892132007521775621, 892132097703522304, 892132119492698112, 892131387846922241, 892091103687593984, 892131908938862592, 892132081563611136, 892130819569061891, 892132256730558464, 892132395532656641, 892132439467982848, 892131694739939328, 892130253660975104, 892130839261310977, 892131283391967237, 892130789659467778] \n", + "45 [892130732461744129, 892131128785723392, 892130559811387392, 892130510209765376, 892130644985339904, 892131734002704384, 892130877769003008, 892131357392064512, 892132439467982848, 892132396560252930, 892130998598725632, 892131939750162432, 892131894254485504, 892130721585930241, 892130852146208768, 892131745591635970, 892130286712086530, 892131695725387777] \n", + "46 [892130291791179776, 892089795345870848, 892132445495074816, 892090624001933313, 892070377953808384, 892080707388026880, 892131797298855936, 892130681584746497, 892130924984360960, 892079097689624576, 892130674148163584, 892131283391967237, 892090294400937985, 892131384705396736, 892071064464969729, 892132107039916034, 892132255048577024] \n", + "47 [892090872002732034, 892091156670152708, 892091246755315712, 892091347406139392, 892090194601672704, 892091397955878912, 892089020741812224, 892091034687209472, 892089092766388224, 892090424378183681, 892089196231491585, 892090668776083457, 892090951967154176, 892090989011247104, 892089593541140480, 892090553814396928, 892090821104873473] \n", + "48 [892073223319429120, 892088983395733504, 892090539637690370, 892130630665986049, 892080707388026880, 892089769215307777, 892132135829553152, 892089695630438400, 892131064134676480, 892132213701193729, 892090652279701504, 892089296135573505, 892070351290675200, 892072587802677249, 892089028195082240, 892131590750580738, 892090739894738945] \n", + "49 [892087890196406272, 892069460852449281, 892069080290566144, 892070166942466049, 892130940536868864, 892079387431936000, 892087873872183299, 892068016996798468, 892069253083312128, 892069478787194880, 892069015945748480, 892069331554652160, 892078821406416896, 892079296235290630, 892068402709135362, 892073292651044864, 892068748319838209] \n", + "\n", + " cnt_post_ids \n", + "0 739 \n", + "1 335 \n", + "2 284 \n", + "3 171 \n", + "4 87 \n", + "5 85 \n", + "6 72 \n", + "7 65 \n", + "8 54 \n", + "9 49 \n", + "10 48 \n", + "11 47 \n", + "12 43 \n", + "13 41 \n", + "14 40 \n", + "15 39 \n", + "16 38 \n", + "17 35 \n", + "18 33 \n", + "19 33 \n", + "20 33 \n", + "21 32 \n", + "22 29 \n", + "23 27 \n", + "24 25 \n", + "25 24 \n", + "26 24 \n", + "27 23 \n", + "28 22 \n", + "29 22 \n", + "30 22 \n", + "31 22 \n", + "32 22 \n", + "33 21 \n", + "34 21 \n", + "35 21 \n", + "36 21 \n", + "37 20 \n", + "38 20 \n", + "39 19 \n", + "40 19 \n", + "41 19 \n", + "42 18 \n", + "43 18 \n", + "44 18 \n", + "45 18 \n", + "46 17 \n", + "47 17 \n", + "48 17 \n", + "49 17 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get top X hashtags by posts count.\n", + "ds_posts_by_tag = df3.limit(50).toPandas()\n", + "ds_posts_by_tag" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['trump',\n", + " 'scaramucci',\n", + " 'maga',\n", + " 'police',\n", + " 'noconfederate',\n", + " 'tcot',\n", + " 'resist',\n", + " 'trumprussia',\n", + " 'theresistance',\n", + " 'p2',\n", + " 'venezuela',\n", + " 'mooch',\n", + " 'blacklivesmatter',\n", + " 'whitehouse',\n", + " 'ripmsando',\n", + " 'obamacare',\n", + " 'mondaymotivation',\n", + " 'usa',\n", + " 'potus',\n", + " 'brexit',\n", + " 'nsfdw2017',\n", + " 'pjnet',\n", + " 'fakenews',\n", + " 'pdx911',\n", + " 'politics',\n", + " 'mersal',\n", + " 'protest',\n", + " 'trumptrain',\n", + " 'impeachtrump',\n", + " 'shooting',\n", + " 'nowplaying',\n", + " 'cnn',\n", + " 'iran',\n", + " 'bharathanenenu',\n", + " 'whitehousechaos',\n", + " 'aiimsdocnamestharoor',\n", + " 'crime',\n", + " 'northkorea',\n", + " 'donaldtrump',\n", + " 'hdtalk',\n", + " 'aca',\n", + " 'mtvhottest',\n", + " 'comey',\n", + " 'russia',\n", + " 'kelly',\n", + " 'themooch',\n", + " 'msnbc',\n", + " 'c4news',\n", + " 'americafirst',\n", + " 'vivegam']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[('trump', 'scaramucci'),\n", + " ('trump', 'maga'),\n", + " ('trump', 'police'),\n", + " ('trump', 'noconfederate'),\n", + " ('trump', 'tcot'),\n", + " ('trump', 'resist'),\n", + " ('trump', 'trumprussia'),\n", + " ('trump', 'theresistance'),\n", + " ('trump', 'p2'),\n", + " ('trump', 'venezuela'),\n", + " ('trump', 'mooch'),\n", + " ('trump', 'blacklivesmatter'),\n", + " ('trump', 'whitehouse'),\n", + " ('trump', 'ripmsando'),\n", + " ('trump', 'obamacare'),\n", + " ('trump', 'mondaymotivation'),\n", + " ('trump', 'usa'),\n", + " ('trump', 'potus'),\n", + " ('trump', 'brexit'),\n", + " ('trump', 'nsfdw2017'),\n", + " ('trump', 'pjnet'),\n", + " ('trump', 'fakenews'),\n", + " ('trump', 'pdx911'),\n", + " ('trump', 'politics'),\n", + " ('trump', 'mersal'),\n", + " ('trump', 'protest'),\n", + " ('trump', 'trumptrain'),\n", + " ('trump', 'impeachtrump'),\n", + " ('trump', 'shooting'),\n", + " ('trump', 'nowplaying'),\n", + " ('trump', 'cnn'),\n", + " ('trump', 'iran'),\n", + " ('trump', 'bharathanenenu'),\n", + " ('trump', 'whitehousechaos'),\n", + " ('trump', 'aiimsdocnamestharoor'),\n", + " ('trump', 'crime'),\n", + " ('trump', 'northkorea'),\n", + " ('trump', 'donaldtrump'),\n", + " ('trump', 'hdtalk'),\n", + " ('trump', 'aca'),\n", + " ('trump', 'mtvhottest'),\n", + " ('trump', 'comey'),\n", + " ('trump', 'russia'),\n", + " ('trump', 'kelly'),\n", + " ('trump', 'themooch'),\n", + " ('trump', 'msnbc'),\n", + " ('trump', 'c4news'),\n", + " ('trump', 'americafirst'),\n", + " ('trump', 'vivegam'),\n", + " ('scaramucci', 'maga'),\n", + " ('scaramucci', 'police'),\n", + " ('scaramucci', 'noconfederate'),\n", + " ('scaramucci', 'tcot'),\n", + " ('scaramucci', 'resist'),\n", + " ('scaramucci', 'trumprussia'),\n", + " ('scaramucci', 'theresistance'),\n", + " ('scaramucci', 'p2'),\n", + " ('scaramucci', 'venezuela'),\n", + " ('scaramucci', 'mooch'),\n", + " ('scaramucci', 'blacklivesmatter'),\n", + " ('scaramucci', 'whitehouse'),\n", + " ('scaramucci', 'ripmsando'),\n", + " ('scaramucci', 'obamacare'),\n", + " ('scaramucci', 'mondaymotivation'),\n", + " ('scaramucci', 'usa'),\n", + " ('scaramucci', 'potus'),\n", + " ('scaramucci', 'brexit'),\n", + " ('scaramucci', 'nsfdw2017'),\n", + " ('scaramucci', 'pjnet'),\n", + " ('scaramucci', 'fakenews'),\n", + " ('scaramucci', 'pdx911'),\n", + " ('scaramucci', 'politics'),\n", + " ('scaramucci', 'mersal'),\n", + " ('scaramucci', 'protest'),\n", + " ('scaramucci', 'trumptrain'),\n", + " ('scaramucci', 'impeachtrump'),\n", + " ('scaramucci', 'shooting'),\n", + " ('scaramucci', 'nowplaying'),\n", + " ('scaramucci', 'cnn'),\n", + " ('scaramucci', 'iran'),\n", + " ('scaramucci', 'bharathanenenu'),\n", + " ('scaramucci', 'whitehousechaos'),\n", + " ('scaramucci', 'aiimsdocnamestharoor'),\n", + " ('scaramucci', 'crime'),\n", + " ('scaramucci', 'northkorea'),\n", + " ('scaramucci', 'donaldtrump'),\n", + " ('scaramucci', 'hdtalk'),\n", + " ('scaramucci', 'aca'),\n", + " ('scaramucci', 'mtvhottest'),\n", + " ('scaramucci', 'comey'),\n", + " ('scaramucci', 'russia'),\n", + " ('scaramucci', 'kelly'),\n", + " ('scaramucci', 'themooch'),\n", + " ('scaramucci', 'msnbc'),\n", + " ('scaramucci', 'c4news'),\n", + " ('scaramucci', 'americafirst'),\n", + " ('scaramucci', 'vivegam'),\n", + " ('maga', 'police'),\n", + " ('maga', 'noconfederate'),\n", + " ('maga', 'tcot'),\n", + " ('maga', 'resist'),\n", + " ('maga', 'trumprussia'),\n", + " ('maga', 'theresistance'),\n", + " ('maga', 'p2'),\n", + " ('maga', 'venezuela'),\n", + " ('maga', 'mooch'),\n", + " ('maga', 'blacklivesmatter'),\n", + " ('maga', 'whitehouse'),\n", + " ('maga', 'ripmsando'),\n", + " ('maga', 'obamacare'),\n", + " ('maga', 'mondaymotivation'),\n", + " ('maga', 'usa'),\n", + " ('maga', 'potus'),\n", + " ('maga', 'brexit'),\n", + " ('maga', 'nsfdw2017'),\n", + " ('maga', 'pjnet'),\n", + " ('maga', 'fakenews'),\n", + " ('maga', 'pdx911'),\n", + " ('maga', 'politics'),\n", + " ('maga', 'mersal'),\n", + " ('maga', 'protest'),\n", + " ('maga', 'trumptrain'),\n", + " ('maga', 'impeachtrump'),\n", + " ('maga', 'shooting'),\n", + " ('maga', 'nowplaying'),\n", + " ('maga', 'cnn'),\n", + " ('maga', 'iran'),\n", + " ('maga', 'bharathanenenu'),\n", + " ('maga', 'whitehousechaos'),\n", + " ('maga', 'aiimsdocnamestharoor'),\n", + " ('maga', 'crime'),\n", + " ('maga', 'northkorea'),\n", + " ('maga', 'donaldtrump'),\n", + " ('maga', 'hdtalk'),\n", + " ('maga', 'aca'),\n", + " ('maga', 'mtvhottest'),\n", + " ('maga', 'comey'),\n", + " ('maga', 'russia'),\n", + " ('maga', 'kelly'),\n", + " ('maga', 'themooch'),\n", + " ('maga', 'msnbc'),\n", + " ('maga', 'c4news'),\n", + " ('maga', 'americafirst'),\n", + " ('maga', 'vivegam'),\n", + " ('police', 'noconfederate'),\n", + " ('police', 'tcot'),\n", + " ('police', 'resist'),\n", + " ('police', 'trumprussia'),\n", + " ('police', 'theresistance'),\n", + " ('police', 'p2'),\n", + " ('police', 'venezuela'),\n", + " ('police', 'mooch'),\n", + " ('police', 'blacklivesmatter'),\n", + " ('police', 'whitehouse'),\n", + " ('police', 'ripmsando'),\n", + " ('police', 'obamacare'),\n", + " ('police', 'mondaymotivation'),\n", + " ('police', 'usa'),\n", + " ('police', 'potus'),\n", + " ('police', 'brexit'),\n", + " ('police', 'nsfdw2017'),\n", + " ('police', 'pjnet'),\n", + " ('police', 'fakenews'),\n", + " ('police', 'pdx911'),\n", + " ('police', 'politics'),\n", + " ('police', 'mersal'),\n", + " ('police', 'protest'),\n", + " ('police', 'trumptrain'),\n", + " ('police', 'impeachtrump'),\n", + " ('police', 'shooting'),\n", + " ('police', 'nowplaying'),\n", + " ('police', 'cnn'),\n", + " ('police', 'iran'),\n", + " ('police', 'bharathanenenu'),\n", + " ('police', 'whitehousechaos'),\n", + " ('police', 'aiimsdocnamestharoor'),\n", + " ('police', 'crime'),\n", + " ('police', 'northkorea'),\n", + " ('police', 'donaldtrump'),\n", + " ('police', 'hdtalk'),\n", + " ('police', 'aca'),\n", + " ('police', 'mtvhottest'),\n", + " ('police', 'comey'),\n", + " ('police', 'russia'),\n", + " ('police', 'kelly'),\n", + " ('police', 'themooch'),\n", + " ('police', 'msnbc'),\n", + " ('police', 'c4news'),\n", + " ('police', 'americafirst'),\n", + " ('police', 'vivegam'),\n", + " ('noconfederate', 'tcot'),\n", + " ('noconfederate', 'resist'),\n", + " ('noconfederate', 'trumprussia'),\n", + " ('noconfederate', 'theresistance'),\n", + " ('noconfederate', 'p2'),\n", + " ('noconfederate', 'venezuela'),\n", + " ('noconfederate', 'mooch'),\n", + " ('noconfederate', 'blacklivesmatter'),\n", + " ('noconfederate', 'whitehouse'),\n", + " ('noconfederate', 'ripmsando'),\n", + " ('noconfederate', 'obamacare'),\n", + " ('noconfederate', 'mondaymotivation'),\n", + " ('noconfederate', 'usa'),\n", + " ('noconfederate', 'potus'),\n", + " ('noconfederate', 'brexit'),\n", + " ('noconfederate', 'nsfdw2017'),\n", + " ('noconfederate', 'pjnet'),\n", + " ('noconfederate', 'fakenews'),\n", + " ('noconfederate', 'pdx911'),\n", + " ('noconfederate', 'politics'),\n", + " ('noconfederate', 'mersal'),\n", + " ('noconfederate', 'protest'),\n", + " ('noconfederate', 'trumptrain'),\n", + " ('noconfederate', 'impeachtrump'),\n", + " ('noconfederate', 'shooting'),\n", + " ('noconfederate', 'nowplaying'),\n", + " ('noconfederate', 'cnn'),\n", + " ('noconfederate', 'iran'),\n", + " ('noconfederate', 'bharathanenenu'),\n", + " ('noconfederate', 'whitehousechaos'),\n", + " ('noconfederate', 'aiimsdocnamestharoor'),\n", + " ('noconfederate', 'crime'),\n", + " ('noconfederate', 'northkorea'),\n", + " ('noconfederate', 'donaldtrump'),\n", + " ('noconfederate', 'hdtalk'),\n", + " ('noconfederate', 'aca'),\n", + " ('noconfederate', 'mtvhottest'),\n", + " ('noconfederate', 'comey'),\n", + " ('noconfederate', 'russia'),\n", + " ('noconfederate', 'kelly'),\n", + " ('noconfederate', 'themooch'),\n", + " ('noconfederate', 'msnbc'),\n", + " ('noconfederate', 'c4news'),\n", + " ('noconfederate', 'americafirst'),\n", + " ('noconfederate', 'vivegam'),\n", + " ('tcot', 'resist'),\n", + " ('tcot', 'trumprussia'),\n", + " ('tcot', 'theresistance'),\n", + " ('tcot', 'p2'),\n", + " ('tcot', 'venezuela'),\n", + " ('tcot', 'mooch'),\n", + " ('tcot', 'blacklivesmatter'),\n", + " ('tcot', 'whitehouse'),\n", + " ('tcot', 'ripmsando'),\n", + " ('tcot', 'obamacare'),\n", + " ('tcot', 'mondaymotivation'),\n", + " ('tcot', 'usa'),\n", + " ('tcot', 'potus'),\n", + " ('tcot', 'brexit'),\n", + " ('tcot', 'nsfdw2017'),\n", + " ('tcot', 'pjnet'),\n", + " ('tcot', 'fakenews'),\n", + " ('tcot', 'pdx911'),\n", + " ('tcot', 'politics'),\n", + " ('tcot', 'mersal'),\n", + " ('tcot', 'protest'),\n", + " ('tcot', 'trumptrain'),\n", + " ('tcot', 'impeachtrump'),\n", + " ('tcot', 'shooting'),\n", + " ('tcot', 'nowplaying'),\n", + " ('tcot', 'cnn'),\n", + " ('tcot', 'iran'),\n", + " ('tcot', 'bharathanenenu'),\n", + " ('tcot', 'whitehousechaos'),\n", + " ('tcot', 'aiimsdocnamestharoor'),\n", + " ('tcot', 'crime'),\n", + " ('tcot', 'northkorea'),\n", + " ('tcot', 'donaldtrump'),\n", + " ('tcot', 'hdtalk'),\n", + " ('tcot', 'aca'),\n", + " ('tcot', 'mtvhottest'),\n", + " ('tcot', 'comey'),\n", + " ('tcot', 'russia'),\n", + " ('tcot', 'kelly'),\n", + " ('tcot', 'themooch'),\n", + " ('tcot', 'msnbc'),\n", + " ('tcot', 'c4news'),\n", + " ('tcot', 'americafirst'),\n", + " ('tcot', 'vivegam'),\n", + " ('resist', 'trumprussia'),\n", + " ('resist', 'theresistance'),\n", + " ('resist', 'p2'),\n", + " ('resist', 'venezuela'),\n", + " ('resist', 'mooch'),\n", + " ('resist', 'blacklivesmatter'),\n", + " ('resist', 'whitehouse'),\n", + " ('resist', 'ripmsando'),\n", + " ('resist', 'obamacare'),\n", + " ('resist', 'mondaymotivation'),\n", + " ('resist', 'usa'),\n", + " ('resist', 'potus'),\n", + " ('resist', 'brexit'),\n", + " ('resist', 'nsfdw2017'),\n", + " ('resist', 'pjnet'),\n", + " ('resist', 'fakenews'),\n", + " ('resist', 'pdx911'),\n", + " ('resist', 'politics'),\n", + " ('resist', 'mersal'),\n", + " ('resist', 'protest'),\n", + " ('resist', 'trumptrain'),\n", + " ('resist', 'impeachtrump'),\n", + " ('resist', 'shooting'),\n", + " ('resist', 'nowplaying'),\n", + " ('resist', 'cnn'),\n", + " ('resist', 'iran'),\n", + " ('resist', 'bharathanenenu'),\n", + " ('resist', 'whitehousechaos'),\n", + " ('resist', 'aiimsdocnamestharoor'),\n", + " ('resist', 'crime'),\n", + " ('resist', 'northkorea'),\n", + " ('resist', 'donaldtrump'),\n", + " ('resist', 'hdtalk'),\n", + " ('resist', 'aca'),\n", + " ('resist', 'mtvhottest'),\n", + " ('resist', 'comey'),\n", + " ('resist', 'russia'),\n", + " ('resist', 'kelly'),\n", + " ('resist', 'themooch'),\n", + " ('resist', 'msnbc'),\n", + " ('resist', 'c4news'),\n", + " ('resist', 'americafirst'),\n", + " ('resist', 'vivegam'),\n", + " ('trumprussia', 'theresistance'),\n", + " ('trumprussia', 'p2'),\n", + " ('trumprussia', 'venezuela'),\n", + " ('trumprussia', 'mooch'),\n", + " ('trumprussia', 'blacklivesmatter'),\n", + " ('trumprussia', 'whitehouse'),\n", + " ('trumprussia', 'ripmsando'),\n", + " ('trumprussia', 'obamacare'),\n", + " ('trumprussia', 'mondaymotivation'),\n", + " ('trumprussia', 'usa'),\n", + " ('trumprussia', 'potus'),\n", + " ('trumprussia', 'brexit'),\n", + " ('trumprussia', 'nsfdw2017'),\n", + " ('trumprussia', 'pjnet'),\n", + " ('trumprussia', 'fakenews'),\n", + " ('trumprussia', 'pdx911'),\n", + " ('trumprussia', 'politics'),\n", + " ('trumprussia', 'mersal'),\n", + " ('trumprussia', 'protest'),\n", + " ('trumprussia', 'trumptrain'),\n", + " ('trumprussia', 'impeachtrump'),\n", + " ('trumprussia', 'shooting'),\n", + " ('trumprussia', 'nowplaying'),\n", + " ('trumprussia', 'cnn'),\n", + " ('trumprussia', 'iran'),\n", + " ('trumprussia', 'bharathanenenu'),\n", + " ('trumprussia', 'whitehousechaos'),\n", + " ('trumprussia', 'aiimsdocnamestharoor'),\n", + " ('trumprussia', 'crime'),\n", + " ('trumprussia', 'northkorea'),\n", + " ('trumprussia', 'donaldtrump'),\n", + " ('trumprussia', 'hdtalk'),\n", + " ('trumprussia', 'aca'),\n", + " ('trumprussia', 'mtvhottest'),\n", + " ('trumprussia', 'comey'),\n", + " ('trumprussia', 'russia'),\n", + " ('trumprussia', 'kelly'),\n", + " ('trumprussia', 'themooch'),\n", + " ('trumprussia', 'msnbc'),\n", + " ('trumprussia', 'c4news'),\n", + " ('trumprussia', 'americafirst'),\n", + " ('trumprussia', 'vivegam'),\n", + " ('theresistance', 'p2'),\n", + " ('theresistance', 'venezuela'),\n", + " ('theresistance', 'mooch'),\n", + " ('theresistance', 'blacklivesmatter'),\n", + " ('theresistance', 'whitehouse'),\n", + " ('theresistance', 'ripmsando'),\n", + " ('theresistance', 'obamacare'),\n", + " ('theresistance', 'mondaymotivation'),\n", + " ('theresistance', 'usa'),\n", + " ('theresistance', 'potus'),\n", + " ('theresistance', 'brexit'),\n", + " ('theresistance', 'nsfdw2017'),\n", + " ('theresistance', 'pjnet'),\n", + " ('theresistance', 'fakenews'),\n", + " ('theresistance', 'pdx911'),\n", + " ('theresistance', 'politics'),\n", + " ('theresistance', 'mersal'),\n", + " ('theresistance', 'protest'),\n", + " ('theresistance', 'trumptrain'),\n", + " ('theresistance', 'impeachtrump'),\n", + " ('theresistance', 'shooting'),\n", + " ('theresistance', 'nowplaying'),\n", + " ('theresistance', 'cnn'),\n", + " ('theresistance', 'iran'),\n", + " ('theresistance', 'bharathanenenu'),\n", + " ('theresistance', 'whitehousechaos'),\n", + " ('theresistance', 'aiimsdocnamestharoor'),\n", + " ('theresistance', 'crime'),\n", + " ('theresistance', 'northkorea'),\n", + " ('theresistance', 'donaldtrump'),\n", + " ('theresistance', 'hdtalk'),\n", + " ('theresistance', 'aca'),\n", + " ('theresistance', 'mtvhottest'),\n", + " ('theresistance', 'comey'),\n", + " ('theresistance', 'russia'),\n", + " ('theresistance', 'kelly'),\n", + " ('theresistance', 'themooch'),\n", + " ('theresistance', 'msnbc'),\n", + " ('theresistance', 'c4news'),\n", + " ('theresistance', 'americafirst'),\n", + " ('theresistance', 'vivegam'),\n", + " ('p2', 'venezuela'),\n", + " ('p2', 'mooch'),\n", + " ('p2', 'blacklivesmatter'),\n", + " ('p2', 'whitehouse'),\n", + " ('p2', 'ripmsando'),\n", + " ('p2', 'obamacare'),\n", + " ('p2', 'mondaymotivation'),\n", + " ('p2', 'usa'),\n", + " ('p2', 'potus'),\n", + " ('p2', 'brexit'),\n", + " ('p2', 'nsfdw2017'),\n", + " ('p2', 'pjnet'),\n", + " ('p2', 'fakenews'),\n", + " ('p2', 'pdx911'),\n", + " ('p2', 'politics'),\n", + " ('p2', 'mersal'),\n", + " ('p2', 'protest'),\n", + " ('p2', 'trumptrain'),\n", + " ('p2', 'impeachtrump'),\n", + " ('p2', 'shooting'),\n", + " ('p2', 'nowplaying'),\n", + " ('p2', 'cnn'),\n", + " ('p2', 'iran'),\n", + " ('p2', 'bharathanenenu'),\n", + " ('p2', 'whitehousechaos'),\n", + " ('p2', 'aiimsdocnamestharoor'),\n", + " ('p2', 'crime'),\n", + " ('p2', 'northkorea'),\n", + " ('p2', 'donaldtrump'),\n", + " ('p2', 'hdtalk'),\n", + " ('p2', 'aca'),\n", + " ('p2', 'mtvhottest'),\n", + " ('p2', 'comey'),\n", + " ('p2', 'russia'),\n", + " ('p2', 'kelly'),\n", + " ('p2', 'themooch'),\n", + " ('p2', 'msnbc'),\n", + " ('p2', 'c4news'),\n", + " ('p2', 'americafirst'),\n", + " ('p2', 'vivegam'),\n", + " ('venezuela', 'mooch'),\n", + " ('venezuela', 'blacklivesmatter'),\n", + " ('venezuela', 'whitehouse'),\n", + " ('venezuela', 'ripmsando'),\n", + " ('venezuela', 'obamacare'),\n", + " ('venezuela', 'mondaymotivation'),\n", + " ('venezuela', 'usa'),\n", + " ('venezuela', 'potus'),\n", + " ('venezuela', 'brexit'),\n", + " ('venezuela', 'nsfdw2017'),\n", + " ('venezuela', 'pjnet'),\n", + " ('venezuela', 'fakenews'),\n", + " ('venezuela', 'pdx911'),\n", + " ('venezuela', 'politics'),\n", + " ('venezuela', 'mersal'),\n", + " ('venezuela', 'protest'),\n", + " ('venezuela', 'trumptrain'),\n", + " ('venezuela', 'impeachtrump'),\n", + " ('venezuela', 'shooting'),\n", + " ('venezuela', 'nowplaying'),\n", + " ('venezuela', 'cnn'),\n", + " ('venezuela', 'iran'),\n", + " ('venezuela', 'bharathanenenu'),\n", + " ('venezuela', 'whitehousechaos'),\n", + " ('venezuela', 'aiimsdocnamestharoor'),\n", + " ('venezuela', 'crime'),\n", + " ('venezuela', 'northkorea'),\n", + " ('venezuela', 'donaldtrump'),\n", + " ('venezuela', 'hdtalk'),\n", + " ('venezuela', 'aca'),\n", + " ('venezuela', 'mtvhottest'),\n", + " ('venezuela', 'comey'),\n", + " ('venezuela', 'russia'),\n", + " ('venezuela', 'kelly'),\n", + " ('venezuela', 'themooch'),\n", + " ('venezuela', 'msnbc'),\n", + " ('venezuela', 'c4news'),\n", + " ('venezuela', 'americafirst'),\n", + " ('venezuela', 'vivegam'),\n", + " ('mooch', 'blacklivesmatter'),\n", + " ('mooch', 'whitehouse'),\n", + " ('mooch', 'ripmsando'),\n", + " ('mooch', 'obamacare'),\n", + " ('mooch', 'mondaymotivation'),\n", + " ('mooch', 'usa'),\n", + " ('mooch', 'potus'),\n", + " ('mooch', 'brexit'),\n", + " ('mooch', 'nsfdw2017'),\n", + " ('mooch', 'pjnet'),\n", + " ('mooch', 'fakenews'),\n", + " ('mooch', 'pdx911'),\n", + " ('mooch', 'politics'),\n", + " ('mooch', 'mersal'),\n", + " ('mooch', 'protest'),\n", + " ('mooch', 'trumptrain'),\n", + " ('mooch', 'impeachtrump'),\n", + " ('mooch', 'shooting'),\n", + " ('mooch', 'nowplaying'),\n", + " ('mooch', 'cnn'),\n", + " ('mooch', 'iran'),\n", + " ('mooch', 'bharathanenenu'),\n", + " ('mooch', 'whitehousechaos'),\n", + " ('mooch', 'aiimsdocnamestharoor'),\n", + " ('mooch', 'crime'),\n", + " ('mooch', 'northkorea'),\n", + " ('mooch', 'donaldtrump'),\n", + " ('mooch', 'hdtalk'),\n", + " ('mooch', 'aca'),\n", + " ('mooch', 'mtvhottest'),\n", + " ('mooch', 'comey'),\n", + " ('mooch', 'russia'),\n", + " ('mooch', 'kelly'),\n", + " ('mooch', 'themooch'),\n", + " ('mooch', 'msnbc'),\n", + " ('mooch', 'c4news'),\n", + " ('mooch', 'americafirst'),\n", + " ('mooch', 'vivegam'),\n", + " ('blacklivesmatter', 'whitehouse'),\n", + " ('blacklivesmatter', 'ripmsando'),\n", + " ('blacklivesmatter', 'obamacare'),\n", + " ('blacklivesmatter', 'mondaymotivation'),\n", + " ('blacklivesmatter', 'usa'),\n", + " ('blacklivesmatter', 'potus'),\n", + " ('blacklivesmatter', 'brexit'),\n", + " ('blacklivesmatter', 'nsfdw2017'),\n", + " ('blacklivesmatter', 'pjnet'),\n", + " ('blacklivesmatter', 'fakenews'),\n", + " ('blacklivesmatter', 'pdx911'),\n", + " ('blacklivesmatter', 'politics'),\n", + " ('blacklivesmatter', 'mersal'),\n", + " ('blacklivesmatter', 'protest'),\n", + " ('blacklivesmatter', 'trumptrain'),\n", + " ('blacklivesmatter', 'impeachtrump'),\n", + " ('blacklivesmatter', 'shooting'),\n", + " ('blacklivesmatter', 'nowplaying'),\n", + " ('blacklivesmatter', 'cnn'),\n", + " ('blacklivesmatter', 'iran'),\n", + " ('blacklivesmatter', 'bharathanenenu'),\n", + " ('blacklivesmatter', 'whitehousechaos'),\n", + " ('blacklivesmatter', 'aiimsdocnamestharoor'),\n", + " ('blacklivesmatter', 'crime'),\n", + " ('blacklivesmatter', 'northkorea'),\n", + " ('blacklivesmatter', 'donaldtrump'),\n", + " ('blacklivesmatter', 'hdtalk'),\n", + " ('blacklivesmatter', 'aca'),\n", + " ('blacklivesmatter', 'mtvhottest'),\n", + " ('blacklivesmatter', 'comey'),\n", + " ('blacklivesmatter', 'russia'),\n", + " ('blacklivesmatter', 'kelly'),\n", + " ('blacklivesmatter', 'themooch'),\n", + " ('blacklivesmatter', 'msnbc'),\n", + " ('blacklivesmatter', 'c4news'),\n", + " ('blacklivesmatter', 'americafirst'),\n", + " ('blacklivesmatter', 'vivegam'),\n", + " ('whitehouse', 'ripmsando'),\n", + " ('whitehouse', 'obamacare'),\n", + " ('whitehouse', 'mondaymotivation'),\n", + " ('whitehouse', 'usa'),\n", + " ('whitehouse', 'potus'),\n", + " ('whitehouse', 'brexit'),\n", + " ('whitehouse', 'nsfdw2017'),\n", + " ('whitehouse', 'pjnet'),\n", + " ('whitehouse', 'fakenews'),\n", + " ('whitehouse', 'pdx911'),\n", + " ('whitehouse', 'politics'),\n", + " ('whitehouse', 'mersal'),\n", + " ('whitehouse', 'protest'),\n", + " ('whitehouse', 'trumptrain'),\n", + " ('whitehouse', 'impeachtrump'),\n", + " ('whitehouse', 'shooting'),\n", + " ('whitehouse', 'nowplaying'),\n", + " ('whitehouse', 'cnn'),\n", + " ('whitehouse', 'iran'),\n", + " ('whitehouse', 'bharathanenenu'),\n", + " ('whitehouse', 'whitehousechaos'),\n", + " ('whitehouse', 'aiimsdocnamestharoor'),\n", + " ('whitehouse', 'crime'),\n", + " ('whitehouse', 'northkorea'),\n", + " ('whitehouse', 'donaldtrump'),\n", + " ('whitehouse', 'hdtalk'),\n", + " ('whitehouse', 'aca'),\n", + " ('whitehouse', 'mtvhottest'),\n", + " ('whitehouse', 'comey'),\n", + " ('whitehouse', 'russia'),\n", + " ('whitehouse', 'kelly'),\n", + " ('whitehouse', 'themooch'),\n", + " ('whitehouse', 'msnbc'),\n", + " ('whitehouse', 'c4news'),\n", + " ('whitehouse', 'americafirst'),\n", + " ('whitehouse', 'vivegam'),\n", + " ('ripmsando', 'obamacare'),\n", + " ('ripmsando', 'mondaymotivation'),\n", + " ('ripmsando', 'usa'),\n", + " ('ripmsando', 'potus'),\n", + " ('ripmsando', 'brexit'),\n", + " ('ripmsando', 'nsfdw2017'),\n", + " ('ripmsando', 'pjnet'),\n", + " ('ripmsando', 'fakenews'),\n", + " ('ripmsando', 'pdx911'),\n", + " ('ripmsando', 'politics'),\n", + " ('ripmsando', 'mersal'),\n", + " ('ripmsando', 'protest'),\n", + " ('ripmsando', 'trumptrain'),\n", + " ('ripmsando', 'impeachtrump'),\n", + " ('ripmsando', 'shooting'),\n", + " ('ripmsando', 'nowplaying'),\n", + " ('ripmsando', 'cnn'),\n", + " ('ripmsando', 'iran'),\n", + " ('ripmsando', 'bharathanenenu'),\n", + " ('ripmsando', 'whitehousechaos'),\n", + " ('ripmsando', 'aiimsdocnamestharoor'),\n", + " ('ripmsando', 'crime'),\n", + " ('ripmsando', 'northkorea'),\n", + " ('ripmsando', 'donaldtrump'),\n", + " ('ripmsando', 'hdtalk'),\n", + " ('ripmsando', 'aca'),\n", + " ('ripmsando', 'mtvhottest'),\n", + " ('ripmsando', 'comey'),\n", + " ('ripmsando', 'russia'),\n", + " ('ripmsando', 'kelly'),\n", + " ('ripmsando', 'themooch'),\n", + " ('ripmsando', 'msnbc'),\n", + " ('ripmsando', 'c4news'),\n", + " ('ripmsando', 'americafirst'),\n", + " ('ripmsando', 'vivegam'),\n", + " ('obamacare', 'mondaymotivation'),\n", + " ('obamacare', 'usa'),\n", + " ('obamacare', 'potus'),\n", + " ('obamacare', 'brexit'),\n", + " ('obamacare', 'nsfdw2017'),\n", + " ('obamacare', 'pjnet'),\n", + " ('obamacare', 'fakenews'),\n", + " ('obamacare', 'pdx911'),\n", + " ('obamacare', 'politics'),\n", + " ('obamacare', 'mersal'),\n", + " ('obamacare', 'protest'),\n", + " ('obamacare', 'trumptrain'),\n", + " ('obamacare', 'impeachtrump'),\n", + " ('obamacare', 'shooting'),\n", + " ('obamacare', 'nowplaying'),\n", + " ('obamacare', 'cnn'),\n", + " ('obamacare', 'iran'),\n", + " ('obamacare', 'bharathanenenu'),\n", + " ('obamacare', 'whitehousechaos'),\n", + " ('obamacare', 'aiimsdocnamestharoor'),\n", + " ('obamacare', 'crime'),\n", + " ('obamacare', 'northkorea'),\n", + " ('obamacare', 'donaldtrump'),\n", + " ('obamacare', 'hdtalk'),\n", + " ('obamacare', 'aca'),\n", + " ('obamacare', 'mtvhottest'),\n", + " ('obamacare', 'comey'),\n", + " ('obamacare', 'russia'),\n", + " ('obamacare', 'kelly'),\n", + " ('obamacare', 'themooch'),\n", + " ('obamacare', 'msnbc'),\n", + " ('obamacare', 'c4news'),\n", + " ('obamacare', 'americafirst'),\n", + " ('obamacare', 'vivegam'),\n", + " ('mondaymotivation', 'usa'),\n", + " ('mondaymotivation', 'potus'),\n", + " ('mondaymotivation', 'brexit'),\n", + " ('mondaymotivation', 'nsfdw2017'),\n", + " ('mondaymotivation', 'pjnet'),\n", + " ('mondaymotivation', 'fakenews'),\n", + " ('mondaymotivation', 'pdx911'),\n", + " ('mondaymotivation', 'politics'),\n", + " ('mondaymotivation', 'mersal'),\n", + " ('mondaymotivation', 'protest'),\n", + " ('mondaymotivation', 'trumptrain'),\n", + " ('mondaymotivation', 'impeachtrump'),\n", + " ('mondaymotivation', 'shooting'),\n", + " ('mondaymotivation', 'nowplaying'),\n", + " ('mondaymotivation', 'cnn'),\n", + " ('mondaymotivation', 'iran'),\n", + " ('mondaymotivation', 'bharathanenenu'),\n", + " ('mondaymotivation', 'whitehousechaos'),\n", + " ('mondaymotivation', 'aiimsdocnamestharoor'),\n", + " ('mondaymotivation', 'crime'),\n", + " ('mondaymotivation', 'northkorea'),\n", + " ('mondaymotivation', 'donaldtrump'),\n", + " ('mondaymotivation', 'hdtalk'),\n", + " ('mondaymotivation', 'aca'),\n", + " ('mondaymotivation', 'mtvhottest'),\n", + " ('mondaymotivation', 'comey'),\n", + " ('mondaymotivation', 'russia'),\n", + " ('mondaymotivation', 'kelly'),\n", + " ('mondaymotivation', 'themooch'),\n", + " ('mondaymotivation', 'msnbc'),\n", + " ('mondaymotivation', 'c4news'),\n", + " ('mondaymotivation', 'americafirst'),\n", + " ('mondaymotivation', 'vivegam'),\n", + " ('usa', 'potus'),\n", + " ('usa', 'brexit'),\n", + " ('usa', 'nsfdw2017'),\n", + " ('usa', 'pjnet'),\n", + " ('usa', 'fakenews'),\n", + " ('usa', 'pdx911'),\n", + " ('usa', 'politics'),\n", + " ('usa', 'mersal'),\n", + " ('usa', 'protest'),\n", + " ('usa', 'trumptrain'),\n", + " ('usa', 'impeachtrump'),\n", + " ('usa', 'shooting'),\n", + " ('usa', 'nowplaying'),\n", + " ('usa', 'cnn'),\n", + " ('usa', 'iran'),\n", + " ('usa', 'bharathanenenu'),\n", + " ('usa', 'whitehousechaos'),\n", + " ('usa', 'aiimsdocnamestharoor'),\n", + " ('usa', 'crime'),\n", + " ('usa', 'northkorea'),\n", + " ('usa', 'donaldtrump'),\n", + " ('usa', 'hdtalk'),\n", + " ('usa', 'aca'),\n", + " ('usa', 'mtvhottest'),\n", + " ('usa', 'comey'),\n", + " ('usa', 'russia'),\n", + " ('usa', 'kelly'),\n", + " ('usa', 'themooch'),\n", + " ('usa', 'msnbc'),\n", + " ('usa', 'c4news'),\n", + " ('usa', 'americafirst'),\n", + " ('usa', 'vivegam'),\n", + " ('potus', 'brexit'),\n", + " ('potus', 'nsfdw2017'),\n", + " ('potus', 'pjnet'),\n", + " ('potus', 'fakenews'),\n", + " ('potus', 'pdx911'),\n", + " ('potus', 'politics'),\n", + " ('potus', 'mersal'),\n", + " ('potus', 'protest'),\n", + " ('potus', 'trumptrain'),\n", + " ('potus', 'impeachtrump'),\n", + " ('potus', 'shooting'),\n", + " ('potus', 'nowplaying'),\n", + " ('potus', 'cnn'),\n", + " ('potus', 'iran'),\n", + " ('potus', 'bharathanenenu'),\n", + " ('potus', 'whitehousechaos'),\n", + " ('potus', 'aiimsdocnamestharoor'),\n", + " ('potus', 'crime'),\n", + " ('potus', 'northkorea'),\n", + " ('potus', 'donaldtrump'),\n", + " ('potus', 'hdtalk'),\n", + " ('potus', 'aca'),\n", + " ('potus', 'mtvhottest'),\n", + " ('potus', 'comey'),\n", + " ('potus', 'russia'),\n", + " ('potus', 'kelly'),\n", + " ('potus', 'themooch'),\n", + " ('potus', 'msnbc'),\n", + " ('potus', 'c4news'),\n", + " ('potus', 'americafirst'),\n", + " ('potus', 'vivegam'),\n", + " ('brexit', 'nsfdw2017'),\n", + " ('brexit', 'pjnet'),\n", + " ('brexit', 'fakenews'),\n", + " ('brexit', 'pdx911'),\n", + " ('brexit', 'politics'),\n", + " ('brexit', 'mersal'),\n", + " ('brexit', 'protest'),\n", + " ('brexit', 'trumptrain'),\n", + " ('brexit', 'impeachtrump'),\n", + " ('brexit', 'shooting'),\n", + " ('brexit', 'nowplaying'),\n", + " ('brexit', 'cnn'),\n", + " ('brexit', 'iran'),\n", + " ('brexit', 'bharathanenenu'),\n", + " ('brexit', 'whitehousechaos'),\n", + " ('brexit', 'aiimsdocnamestharoor'),\n", + " ('brexit', 'crime'),\n", + " ('brexit', 'northkorea'),\n", + " ('brexit', 'donaldtrump'),\n", + " ('brexit', 'hdtalk'),\n", + " ('brexit', 'aca'),\n", + " ('brexit', 'mtvhottest'),\n", + " ('brexit', 'comey'),\n", + " ('brexit', 'russia'),\n", + " ('brexit', 'kelly'),\n", + " ('brexit', 'themooch'),\n", + " ('brexit', 'msnbc'),\n", + " ('brexit', 'c4news'),\n", + " ('brexit', 'americafirst'),\n", + " ('brexit', 'vivegam'),\n", + " ('nsfdw2017', 'pjnet'),\n", + " ('nsfdw2017', 'fakenews'),\n", + " ('nsfdw2017', 'pdx911'),\n", + " ('nsfdw2017', 'politics'),\n", + " ('nsfdw2017', 'mersal'),\n", + " ('nsfdw2017', 'protest'),\n", + " ('nsfdw2017', 'trumptrain'),\n", + " ('nsfdw2017', 'impeachtrump'),\n", + " ('nsfdw2017', 'shooting'),\n", + " ('nsfdw2017', 'nowplaying'),\n", + " ('nsfdw2017', 'cnn'),\n", + " ('nsfdw2017', 'iran'),\n", + " ('nsfdw2017', 'bharathanenenu'),\n", + " ('nsfdw2017', 'whitehousechaos'),\n", + " ('nsfdw2017', 'aiimsdocnamestharoor'),\n", + " ('nsfdw2017', 'crime'),\n", + " ('nsfdw2017', 'northkorea'),\n", + " ('nsfdw2017', 'donaldtrump'),\n", + " ('nsfdw2017', 'hdtalk'),\n", + " ('nsfdw2017', 'aca'),\n", + " ('nsfdw2017', 'mtvhottest'),\n", + " ('nsfdw2017', 'comey'),\n", + " ('nsfdw2017', 'russia'),\n", + " ('nsfdw2017', 'kelly'),\n", + " ('nsfdw2017', 'themooch'),\n", + " ('nsfdw2017', 'msnbc'),\n", + " ('nsfdw2017', 'c4news'),\n", + " ('nsfdw2017', 'americafirst'),\n", + " ('nsfdw2017', 'vivegam'),\n", + " ('pjnet', 'fakenews'),\n", + " ('pjnet', 'pdx911'),\n", + " ('pjnet', 'politics'),\n", + " ('pjnet', 'mersal'),\n", + " ('pjnet', 'protest'),\n", + " ('pjnet', 'trumptrain'),\n", + " ('pjnet', 'impeachtrump'),\n", + " ('pjnet', 'shooting'),\n", + " ('pjnet', 'nowplaying'),\n", + " ('pjnet', 'cnn'),\n", + " ('pjnet', 'iran'),\n", + " ('pjnet', 'bharathanenenu'),\n", + " ('pjnet', 'whitehousechaos'),\n", + " ('pjnet', 'aiimsdocnamestharoor'),\n", + " ('pjnet', 'crime'),\n", + " ('pjnet', 'northkorea'),\n", + " ('pjnet', 'donaldtrump'),\n", + " ('pjnet', 'hdtalk'),\n", + " ('pjnet', 'aca'),\n", + " ('pjnet', 'mtvhottest'),\n", + " ('pjnet', 'comey'),\n", + " ('pjnet', 'russia'),\n", + " ('pjnet', 'kelly'),\n", + " ('pjnet', 'themooch'),\n", + " ('pjnet', 'msnbc'),\n", + " ('pjnet', 'c4news'),\n", + " ('pjnet', 'americafirst'),\n", + " ('pjnet', 'vivegam'),\n", + " ('fakenews', 'pdx911'),\n", + " ('fakenews', 'politics'),\n", + " ('fakenews', 'mersal'),\n", + " ('fakenews', 'protest'),\n", + " ('fakenews', 'trumptrain'),\n", + " ('fakenews', 'impeachtrump'),\n", + " ('fakenews', 'shooting'),\n", + " ('fakenews', 'nowplaying'),\n", + " ('fakenews', 'cnn'),\n", + " ('fakenews', 'iran'),\n", + " ('fakenews', 'bharathanenenu'),\n", + " ('fakenews', 'whitehousechaos'),\n", + " ('fakenews', 'aiimsdocnamestharoor'),\n", + " ('fakenews', 'crime'),\n", + " ('fakenews', 'northkorea'),\n", + " ('fakenews', 'donaldtrump'),\n", + " ('fakenews', 'hdtalk'),\n", + " ('fakenews', 'aca'),\n", + " ('fakenews', 'mtvhottest'),\n", + " ('fakenews', 'comey'),\n", + " ('fakenews', 'russia'),\n", + " ('fakenews', 'kelly'),\n", + " ('fakenews', 'themooch'),\n", + " ('fakenews', 'msnbc'),\n", + " ('fakenews', 'c4news'),\n", + " ('fakenews', 'americafirst'),\n", + " ('fakenews', 'vivegam'),\n", + " ('pdx911', 'politics'),\n", + " ('pdx911', 'mersal'),\n", + " ('pdx911', 'protest'),\n", + " ('pdx911', 'trumptrain'),\n", + " ('pdx911', 'impeachtrump'),\n", + " ('pdx911', 'shooting'),\n", + " ('pdx911', 'nowplaying'),\n", + " ('pdx911', 'cnn'),\n", + " ('pdx911', 'iran'),\n", + " ('pdx911', 'bharathanenenu'),\n", + " ('pdx911', 'whitehousechaos'),\n", + " ('pdx911', 'aiimsdocnamestharoor'),\n", + " ('pdx911', 'crime'),\n", + " ('pdx911', 'northkorea'),\n", + " ('pdx911', 'donaldtrump'),\n", + " ('pdx911', 'hdtalk'),\n", + " ('pdx911', 'aca'),\n", + " ('pdx911', 'mtvhottest'),\n", + " ('pdx911', 'comey'),\n", + " ('pdx911', 'russia'),\n", + " ('pdx911', 'kelly'),\n", + " ('pdx911', 'themooch'),\n", + " ('pdx911', 'msnbc'),\n", + " ('pdx911', 'c4news'),\n", + " ('pdx911', 'americafirst'),\n", + " ('pdx911', 'vivegam'),\n", + " ('politics', 'mersal'),\n", + " ('politics', 'protest'),\n", + " ('politics', 'trumptrain'),\n", + " ('politics', 'impeachtrump'),\n", + " ('politics', 'shooting'),\n", + " ('politics', 'nowplaying'),\n", + " ('politics', 'cnn'),\n", + " ('politics', 'iran'),\n", + " ('politics', 'bharathanenenu'),\n", + " ('politics', 'whitehousechaos'),\n", + " ('politics', 'aiimsdocnamestharoor'),\n", + " ('politics', 'crime'),\n", + " ('politics', 'northkorea'),\n", + " ('politics', 'donaldtrump'),\n", + " ('politics', 'hdtalk'),\n", + " ('politics', 'aca'),\n", + " ('politics', 'mtvhottest'),\n", + " ('politics', 'comey'),\n", + " ('politics', 'russia'),\n", + " ('politics', 'kelly'),\n", + " ('politics', 'themooch'),\n", + " ('politics', 'msnbc'),\n", + " ('politics', 'c4news'),\n", + " ('politics', 'americafirst'),\n", + " ('politics', 'vivegam'),\n", + " ('mersal', 'protest'),\n", + " ('mersal', 'trumptrain'),\n", + " ('mersal', 'impeachtrump'),\n", + " ('mersal', 'shooting'),\n", + " ('mersal', 'nowplaying'),\n", + " ('mersal', 'cnn'),\n", + " ('mersal', 'iran'),\n", + " ('mersal', 'bharathanenenu'),\n", + " ('mersal', 'whitehousechaos'),\n", + " ('mersal', 'aiimsdocnamestharoor'),\n", + " ('mersal', 'crime'),\n", + " ('mersal', 'northkorea'),\n", + " ('mersal', 'donaldtrump'),\n", + " ('mersal', 'hdtalk'),\n", + " ('mersal', 'aca'),\n", + " ('mersal', 'mtvhottest'),\n", + " ('mersal', 'comey'),\n", + " ('mersal', 'russia'),\n", + " ('mersal', 'kelly'),\n", + " ('mersal', 'themooch'),\n", + " ('mersal', 'msnbc'),\n", + " ('mersal', 'c4news'),\n", + " ('mersal', 'americafirst'),\n", + " ('mersal', 'vivegam'),\n", + " ('protest', 'trumptrain'),\n", + " ('protest', 'impeachtrump'),\n", + " ('protest', 'shooting'),\n", + " ('protest', 'nowplaying'),\n", + " ('protest', 'cnn'),\n", + " ('protest', 'iran'),\n", + " ('protest', 'bharathanenenu'),\n", + " ('protest', 'whitehousechaos'),\n", + " ('protest', 'aiimsdocnamestharoor'),\n", + " ('protest', 'crime'),\n", + " ('protest', 'northkorea'),\n", + " ('protest', 'donaldtrump'),\n", + " ('protest', 'hdtalk'),\n", + " ('protest', 'aca'),\n", + " ('protest', 'mtvhottest'),\n", + " ('protest', 'comey'),\n", + " ('protest', 'russia'),\n", + " ('protest', 'kelly'),\n", + " ('protest', 'themooch'),\n", + " ('protest', 'msnbc'),\n", + " ('protest', 'c4news'),\n", + " ('protest', 'americafirst'),\n", + " ('protest', 'vivegam'),\n", + " ('trumptrain', 'impeachtrump'),\n", + " ('trumptrain', 'shooting'),\n", + " ('trumptrain', 'nowplaying'),\n", + " ('trumptrain', 'cnn'),\n", + " ('trumptrain', 'iran'),\n", + " ('trumptrain', 'bharathanenenu'),\n", + " ('trumptrain', 'whitehousechaos'),\n", + " ('trumptrain', 'aiimsdocnamestharoor'),\n", + " ('trumptrain', 'crime'),\n", + " ('trumptrain', 'northkorea'),\n", + " ('trumptrain', 'donaldtrump'),\n", + " ('trumptrain', 'hdtalk'),\n", + " ('trumptrain', 'aca'),\n", + " ('trumptrain', 'mtvhottest'),\n", + " ('trumptrain', 'comey'),\n", + " ('trumptrain', 'russia'),\n", + " ('trumptrain', 'kelly'),\n", + " ('trumptrain', 'themooch'),\n", + " ('trumptrain', 'msnbc'),\n", + " ('trumptrain', 'c4news'),\n", + " ('trumptrain', 'americafirst'),\n", + " ('trumptrain', 'vivegam'),\n", + " ('impeachtrump', 'shooting'),\n", + " ('impeachtrump', 'nowplaying'),\n", + " ('impeachtrump', 'cnn'),\n", + " ('impeachtrump', 'iran'),\n", + " ('impeachtrump', 'bharathanenenu'),\n", + " ('impeachtrump', 'whitehousechaos'),\n", + " ...]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create hashtag pairs from previous top hashtags.\n", + "\n", + "labels = list(ds_posts_by_tag.iloc[:,0])\n", + "labels\n", + "top_pairs = list(combinations(labels, 2))\n", + "top_pairs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calc PMI" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6337" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[[('cnn', 'hdtalk'), 7.1592643654138159],\n", + " [('cnn', 'msnbc'), 6.734766536885906],\n", + " [('comey', 'kelly'), 5.3721964575011985],\n", + " [('usa', 'americafirst'), 4.9978009427197003],\n", + " [('p2', 'aca'), 4.8449494546363292],\n", + " [('tcot', 'pjnet'), 4.8051558647773041],\n", + " [('politics', 'hdtalk'), 4.7378005969755392],\n", + " [('theresistance', 'impeachtrump'), 4.7371932742804468],\n", + " [('tcot', 'p2'), 4.6929463611912796],\n", + " [('mondaymotivation', 'northkorea'), 4.6831653403986939],\n", + " [('p2', 'americafirst'), 4.5123741155494583],\n", + " [('kelly', 'msnbc'), 4.4546586176931715],\n", + " [('msnbc', 'americafirst'), 4.4546586176931715],\n", + " [('comey', 'msnbc'), 4.3721964575011985],\n", + " [('kelly', 'themooch'), 4.3721964575011985],\n", + " [('potus', 'donaldtrump'), 4.307656205306488],\n", + " [('resist', 'theresistance'), 4.2897342973092254],\n", + " [('whitehousechaos', 'themooch'), 4.2897342973092254],\n", + " [('blacklivesmatter', 'msnbc'), 4.220193364056148],\n", + " [('obamacare', 'aca'), 4.2117317853079523],\n", + " [('p2', 'usa'), 4.2075195340210376],\n", + " [('usa', 'iran'), 4.1783731883615207],\n", + " [('cnn', 'kelly'), 4.1498040361647499],\n", + " [('northkorea', 'russia'), 4.1377312038641749],\n", + " [('usa', 'cnn'), 4.1079838604701227],\n", + " [('usa', 'crime'), 4.1079838604701227],\n", + " [('tcot', 'usa'), 4.0909103471111816],\n", + " [('cnn', 'comey'), 4.0673418759727777],\n", + " [('trumptrain', 'americafirst'), 4.0185595028864975],\n", + " [('iran', 'northkorea'), 3.9857281104191249],\n", + " [('resist', 'msnbc'), 3.9571589582223541],\n", + " [('mooch', 'comey'), 3.9360973426945245],\n", + " [('whitehouse', 'potus'), 3.8569947962969224],\n", + " [('usa', 'politics'), 3.8564450934741585],\n", + " [('protest', 'iran'), 3.7840942492494745],\n", + " [('maga', 'trumptrain'), 3.7738259137956476],\n", + " [('obamacare', 'mondaymotivation'), 3.7571659218424709],\n", + " [('maga', 'americafirst'), 3.7244982014320791],\n", + " [('tcot', 'americafirst'), 3.717693023526965],\n", + " [('resist', 'p2'), 3.6523043766939334],\n", + " [('resist', 'cnn'), 3.6523043766939334],\n", + " [('p2', 'cnn'), 3.6225570332998815],\n", + " [('potus', 'kelly'), 3.5421214589435102],\n", + " [('fakenews', 'cnn'), 3.3792858822875171],\n", + " [('resist', 'trumprussia'), 3.3362768832462555],\n", + " [('trumprussia', 'cnn'), 3.3065295398522032],\n", + " [('maga', 'tcot'), 3.2505670130996669],\n", + " [('theresistance', 'mondaymotivation'), 3.2502059331225879],\n", + " [('trumprussia', 'impeachtrump'), 3.2394153439936662],\n", + " [('theresistance', 'obamacare'), 3.2117317853079519],\n", + " [('maga', 'pjnet'), 3.1904726659361486],\n", + " [('whitehouse', 'kelly'), 3.1845694543254268],\n", + " [('usa', 'donaldtrump'), 3.1783731883615212],\n", + " [('trumprussia', 'trumptrain'), 3.1752850065739509],\n", + " [('blacklivesmatter', 'aca'), 3.1377312038641749],\n", + " [('whitehouse', 'themooch'), 3.1021072941334538],\n", + " [('whitehouse', 'mondaymotivation'), 3.0625789299468162],\n", + " [('usa', 'nowplaying'), 3.0408696646115865],\n", + " [('mooch', 'kelly'), 3.0185595028864975],\n", + " [('p2', 'pjnet'), 3.0148744560786418],\n", + " [('resist', 'impeachtrump'), 3.0002276801142402],\n", + " [('venezuela', 'russia'), 3.0002276801142402],\n", + " [('whitehouse', 'northkorea'), 2.9501042006884037],\n", + " [('whitehouse', 'donaldtrump'), 2.9501042006884037],\n", + " [('trumprussia', 'theresistance'), 2.9439594604674952],\n", + " [('resist', 'protest'), 2.9360973426945245],\n", + " [('mooch', 'themooch'), 2.9360973426945245],\n", + " [('venezuela', 'northkorea'), 2.8482245866691902],\n", + " [('tcot', 'cnn'), 2.8278759412773877],\n", + " [('theresistance', 'msnbc'), 2.7872339567800419],\n", + " [('mooch', 'whitehouse'), 2.7484703395187533],\n", + " [('scaramucci', 'themooch'), 2.7092314447787689],\n", + " [('theresistance', 'russia'), 2.7047717965880689],\n", + " [('theresistance', 'themooch'), 2.7047717965880689],\n", + " [('trump', 'whitehouse'), 2.6162856947509008],\n", + " [('trumprussia', 'usa'), 2.5695639456859971],\n", + " [('trump', 'comey'), 2.5308942035202566],\n", + " [('trumprussia', 'whitehousechaos'), 2.5289219611886513],\n", + " [('trumprussia', 'russia'), 2.5289219611886513],\n", + " [('usa', 'potus'), 2.5003012832488833],\n", + " [('usa', 'pjnet'), 2.5003012832488833],\n", + " [('p2', 'trumptrain'), 2.4913125000216287],\n", + " [('theresistance', 'cnn'), 2.4823793752516208],\n", + " [('scaramucci', 'mooch'), 2.4710717075840045],\n", + " [('police', 'crime'), 2.4213499418540616],\n", + " [('trumprussia', 'northkorea'), 2.3769188677436013],\n", + " [('resist', 'kelly'), 2.372196457501198],\n", + " [('trump', 'cnn'), 2.3085017821838085],\n", + " [('mondaymotivation', 'usa'), 2.2908479176199337],\n", + " [('resist', 'whitehousechaos'), 2.2897342973092254],\n", + " [('resist', 'aca'), 2.2897342973092254],\n", + " [('resist', 'comey'), 2.2897342973092254],\n", + " [('resist', 'russia'), 2.2897342973092254],\n", + " [('trump', 'russia'), 2.2678597976864627],\n", + " [('scaramucci', 'kelly'), 2.2067311042495854],\n", + " [('scaramucci', 'whitehouse'), 2.1065669423241538],\n", + " [('mooch', 'potus'), 2.1060223441368371],\n", + " [('trumprussia', 'p2'), 2.0841371185157551],\n", + " [('tcot', 'russia'), 2.0502683626138358],\n", + " [('trump', 'obamacare'), 2.0378541922401392],\n", + " [('trump', 'hdtalk'), 2.0378541922401392],\n", + " [('maga', 'c4news'), 1.987532607265873],\n", + " [('resist', 'trumptrain'), 1.9360973426945245],\n", + " [('trump', 'potus'), 1.9232116262990167],\n", + " [('maga', 'themooch'), 1.9050704470739002],\n", + " [('tcot', 'trumprussia'), 1.874418527214418],\n", + " [('trump', 'trumptrain'), 1.762219749626712],\n", + " [('p2', 'blacklivesmatter'), 1.6929463611912794],\n", + " [('maga', 'p2'), 1.6826780257374521],\n", + " [('trumprussia', 'brexit'), 1.6544528432725101],\n", + " [('trump', 'kelly'), 1.6133563637122292],\n", + " [('resist', 'fakenews'), 1.6016783036239652],\n", + " [('tcot', 'politics'), 1.5763371742814234],\n", + " [('trump', 'aca'), 1.5308942035202564],\n", + " [('trump', 'themooch'), 1.5308942035202564],\n", + " [('trump', 'tcot'), 1.4613532702671794],\n", + " [('trump', 'northkorea'), 1.3788911100752064],\n", + " [('maga', 'usa'), 1.3607499308500897],\n", + " [('trump', 'maga'), 1.3513447831895435],\n", + " [('theresistance', 'mooch'), 1.3511348419733684],\n", + " [('maga', 'theresistance'), 1.3201079463527439],\n", + " [('scaramucci', 'potus'), 1.2941939454999249],\n", + " [('resist', 'obamacare'), 1.2117317853079521],\n", + " [('trump', 'mooch'), 1.1772572489055557],\n", + " [('police', 'msnbc'), 1.1412420226613262],\n", + " [('scaramucci', 'comey'), 1.1242689440576128],\n", + " [('trump', 'donaldtrump'), 1.1158567042414125],\n", + " [('trump', 'scaramucci'), 1.1023944444348501],\n", + " [('resist', 'whitehouse'), 1.102107294133454],\n", + " [('police', 'usa'), 1.099421846966699],\n", + " [('trump', 'politics'), 1.056963015187844],\n", + " [('tcot', 'resist'), 1.0502683626138358],\n", + " [('tcot', 'mondaymotivation'), 1.0107399984271985],\n", + " [('trump', 'usa'), 0.98657368729644601],\n", + " [('scaramucci', 'donaldtrump'), 0.97226585061256254],\n", + " [('resist', 'mooch'), 0.93609734269452449],\n", + " [('police', 'donaldtrump'), 0.90677676902430315],\n", + " [('maga', 'resist'), 0.90507044707390016],\n", + " [('maga', 'mondaymotivation'), 0.86554208288726264],\n", + " [('police', 'cnn'), 0.83638744113290531],\n", + " [('trump', 'theresistance'), 0.82040082071524112],\n", + " [('police', 'venezuela'), 0.76927324527436813],\n", + " [('police', 'shooting'), 0.76927324527436813],\n", + " [('trump', 'americafirst'), 0.61335636371222924],\n", + " [('maga', 'trumprussia'), 0.55929561023217],\n", + " [('trump', 'trumprussia'), 0.50704746156588865],\n", + " [('tcot', 'theresistance'), 0.46530586189267942],\n", + " [('scaramucci', 'fakenews'), 0.4362129503723528],\n", + " [('trump', 'resist'), 0.40536332143639753],\n", + " [('maga', 'msnbc'), 0.40257010654471687],\n", + " [('trump', 'iran'), 0.37889111007520643],\n", + " [('maga', 'russia'), 0.32010794635274392],\n", + " [('maga', 'obamacare'), 0.24210543435147072],\n", + " [('police', 'potus'), 0.22870486391166553],\n", + " [('police', 'pjnet'), 0.22870486391166553],\n", + " [('scaramucci', 'americafirst'), 0.20673110424958552],\n", + " [('maga', 'northkorea'), 0.16810485290769411],\n", + " [('maga', 'whitehouse'), 0.13248094317697268],\n", + " [('scaramucci', 'resist'), 0.12426894405761273],\n", + " [('scaramucci', 'whitehousechaos'), 0.12426894405761273],\n", + " [('scaramucci', 'russia'), 0.12426894405761273],\n", + " [('trump', 'brexit'), 0.071462584882959118],\n", + " [('trump', 'msnbc'), 0.028393862991073254],\n", + " [('trump', 'c4news'), 0.028393862991073254],\n", + " [('maga', 'mooch'), -0.033529008261956643],\n", + " [('trump', 'whitehousechaos'), -0.054068297200899867],\n", + " [('police', 'blacklivesmatter'), -0.093223230975696847],\n", + " [('maga', 'fakenews'), -0.36794804733251579],\n", + " [('scaramucci', 'theresistance'), -0.46069355666354372],\n", + " [('trump', 'mondaymotivation'), -0.50863416066638112],\n", + " [('maga', 'potus'), -0.50996705220494354],\n", + " [('trump', 'venezuela'), -0.75861241367472854],\n", + " [('scaramucci', 'usa'), -0.83508907144504141],\n", + " [('trump', 'pjnet'), -0.88414329575858741],\n", + " [('police', 'resist'), -0.94122013753064682],\n", + " [('scaramucci', 'obamacare'), -0.95373356794366049],\n", + " [('trump', 'impeachtrump'), -1.3435749143958846],\n", + " [('scaramucci', 'maga'), -1.5234293120115061],\n", + " [('scaramucci', 'trumprussia'), -1.6365433920629613],\n", + " [('trump', 'fakenews'), -1.7421242908861596],\n", + " [('trump', 'p2'), -2.4988531398737956],\n", + " [('trump', 'police'), -2.7000602313196156],\n", + " [('maga', 'police'), -2.9108464884871279],\n", + " [('trump', 'noconfederate'), -3.2591827271055123]]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# calc pmi for each pair using top x hashtags-posts\n", + "\n", + "posts_count = df.count()\n", + "posts_count\n", + "\n", + "pmi_parts = []\n", + " \n", + "for p in top_pairs:\n", + " a=ds_posts_by_tag.loc[ds_posts_by_tag['_tag'] == p[0]]\n", + " b=ds_posts_by_tag.loc[ds_posts_by_tag['_tag'] == p[1]]\n", + " post_ids_0 = list(a.post_ids)[0]\n", + " post_ids_1 = list(b.post_ids)[0]\n", + " \n", + " # [tag pair, count both, count a, count b]\n", + " pmi_parts.append([p, len(set(post_ids_0) & set(post_ids_1)), len(post_ids_0), len(post_ids_1), posts_count])\n", + "\n", + "# pmi_parts\n", + "\n", + "pmi_scores = []\n", + "\n", + "for p in pmi_parts:\n", + " if not(p[1] and p[2] and p[3]):\n", + " continue\n", + " score = np.log2(( p[1]*posts_count)/ (p[2]*p[3]))\n", + " pmi_scores.append([p[0], score])\n", + " \n", + "sorted_scores = sorted(pmi_scores, key= lambda x: x[1], reverse=1)\n", + "sorted_scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Viz threshold for PMI" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 3., 1., 4., 7., 14., 19., 25., 23., 26., 23., 26.,\n", + " 10., 1., 0., 2.]),\n", + " array([-3.25918273, -2.56461959, -1.87005645, -1.17549331, -0.48093017,\n", + " 0.21363297, 0.90819611, 1.60275925, 2.29732239, 2.99188553,\n", + " 3.68644867, 4.38101181, 5.07557495, 5.77013809, 6.46470123,\n", + " 7.15926437]),\n", + " )" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6IAAAKvCAYAAABwEXLtAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHk9JREFUeJzt3V2IlOcZ8PFrdwcSp35k3bGG3erB5oMgaIMollAxidNQ\nkhA8CCGGGMQDsRZKkkWwobEUKyy0g0FQPBFL2pN40FoKhcAmRCEpxCZNW0wrNZjSkoAZv7Lpastm\nfQ/k9X0TNbs7O3s968zvd+TMPjtz3cyNO3+eZ2c7Ll++fDkAAAAgSWfRAwAAANBehCgAAACphCgA\nAACphCgAAACphCgAAACphCgAAACphCgAAACphCgAAACphCgAAACphCgAAACpStlP+NFHH2U/JTNc\npVKJer1e9Bi0GfuOIth3FMG+owj2Xfvq7e2d0HHOiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIA\nAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBK\niAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIA\nAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJBKiAIAAJCqVPQAAMD0qdXmFD3CpAwMDBc9\nwozQzNetXO6KkZHp3QdeN2CynBEFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAF\nAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAg\nlRAFAAAglRAFAAAglRAFAAAgVWm8A+r1euzduzfOnz8fHR0dUa1W4+GHH45Dhw7Fa6+9FnPnzo2I\niPXr18fy5cunfWAAAABubuOGaFdXV2zYsCH6+/vj4sWLsX379li2bFlERDzyyCPx2GOPTfuQAAAA\ntI5xQ7S7uzu6u7sjImLWrFnR19cXZ8+enfbBAAAAaE3jhuj/7/Tp03Hq1Km488474+9//3u8+uqr\ncfTo0ejv749nnnkmZs+efc33DA0NxdDQUEREDA4ORqVSac7ktIxSqWRfkM6+a8zOnV1FjzBhL774\nedEjXKOIfVcu3zyvWUREpXJL0SPMCM183To7O6NcLjft8a7H68aX+TnLeCYcopcuXYparRYbN26M\ncrkcDz30UDz++OMREfHKK6/Eyy+/HFu3br3m+6rValSr1au36/V6E8amlVQqFfuCdPZdY0ZG5hQ9\nwoTV68NFj3CNIvbdzfSaRczM160IzXzdyuVyjIyMNO3xrsfrxpf5Odu+ent7J3TchD41d3R0NGq1\nWqxevTpWrVoVERG33XZbdHZ2RmdnZ6xduzY++OCDxqcFAACgbYwbopcvX479+/dHX19fPProo1fv\nP3fu3NV/v/3227Fo0aLpmRAAAICWMu6luSdOnIijR4/G4sWLY9u2bRFx5U+1vPnmm/Hhhx9GR0dH\nLFiwIDZv3jztwwIAAHDzGzdE77nnnjh06NA19/uboQAAADRiQr8jCgAAAM0iRAEAAEglRAEAAEgl\nRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEA\nAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEgl\nRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEA\nAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEglRAEAAEgl\nRAEAAEhVKnoAACharTYn5XnK5a4YGZn6cw0MDDdhmptf1uvWLF43gP/HGVEAAABSCVEAAABSCVEA\nAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABS\nCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEA\nAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABS\nCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEA\nAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABS\nCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEA\nAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSCVEAAABSlcY7oF6vx969e+P8\n+fPR0dER1Wo1Hn744fjss89i9+7d8cknn8SCBQviueeei9mzZ2fMDAAAwE1s3BDt6uqKDRs2RH9/\nf1y8eDG2b98ey5YtizfeeCOWLl0a69ati8OHD8fhw4fj6aefzpgZAACAm9i4l+Z2d3dHf39/RETM\nmjUr+vr64uzZs3Hs2LFYs2ZNRESsWbMmjh07Nr2TAgAA0BIm9Tuip0+fjlOnTsWdd94ZFy5ciO7u\n7oi4EquffvrptAwIAABAaxn30tz/69KlS1Gr1WLjxo1RLpcn/ARDQ0MxNDQUERGDg4NRqVQmPyUt\nrVQq2Reks+8aUy53FT3ChFUqt0z42Kx1dXZ2Tupn6I3MxLU1i7Vd0cy1NWvffZXJrI324Ocs45lQ\niI6OjkatVovVq1fHqlWrIiJi3rx5ce7cueju7o5z587F3Llzr/u91Wo1qtXq1dv1er0JY9NKKpWK\nfUE6+64xIyNzih5hwur14Qkfm7WucrkcIyMjU36cmbi2ZrG2K5q5tmbtu68ymbXRHvycbV+9vb0T\nOm7cS3MvX74c+/fvj76+vnj00Uev3r9ixYo4cuRIREQcOXIkVq5c2eCoAAAAtJNxz4ieOHEijh49\nGosXL45t27ZFRMT69etj3bp1sXv37nj99dejUqnE888/P+3DAgAAcPMbN0TvueeeOHTo0HW/tmPH\njqYPBAAAQGub1KfmAgAAwFQJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAA\nAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJUQAAAFIJ\nUQAAAFIJUQAAAFKVih4AoNXUanOKHmFSBgaGix4BAGgzzogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQqjTeAfv27Yt333035s2bF7VaLSIiDh06FK+99lrMnTs3IiLWr18fy5cvn95JAQAA\naAnjhuj9998f3/3ud2Pv3r1fuP+RRx6Jxx57bNoGAwAAoDWNe2nukiVLYvbs2RmzAAAA0AbGPSN6\nI6+++mocPXo0+vv745lnnrlhrA4NDcXQ0FBERAwODkalUmn0KWlRpVLJviDddO67crlrWh53ulQq\nt0z42JtpbTNxXZ2dnVEul6f8ODNxbc1ibVc0c23N2ndfZTJroz14f8d4GgrRhx56KB5//PGIiHjl\nlVfi5Zdfjq1bt1732Gq1GtVq9erter3eyFPSwiqVin1BuuncdyMjc6blcadLvT484WNvprXNxHWV\ny+UYGRmZ8uPMxLU1i7Vd0cy1NWvffZXJrI324P1d++rt7Z3QcQ19au5tt90WnZ2d0dnZGWvXro0P\nPvigkYcBAACgDTUUoufOnbv677fffjsWLVrUtIEAAABobeNemvvSSy/F+++/H8PDw7Fly5Z44okn\n4vjx4/Hhhx9GR0dHLFiwIDZv3pwxKwAAAC1g3BB99tlnr7nvwQcfnJZhAAAAaH0NXZoLAAAAjRKi\nAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAA\npBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApBKiAAAApCoVPQDQnmq1OYU+f7nc\nFSMjE59hYGB4GqcBAGgvzogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogC\nAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQ\nSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQSogCAACQqjTeAfv27Yt3\n33035s2bF7VaLSIiPvvss9i9e3d88sknsWDBgnjuuedi9uzZ0z4sAAAAN79xz4jef//98cILL3zh\nvsOHD8fSpUtjz549sXTp0jh8+PC0DQgAAEBrGTdElyxZcs3ZzmPHjsWaNWsiImLNmjVx7Nix6ZkO\nAACAltPQ74heuHAhuru7IyKiu7s7Pv3006YOBQAAQOsa93dEp2poaCiGhoYiImJwcDAqlcp0PyU3\nmVKpZF+0oXK5q9Dn7+zsjHK5POHjK5VbJnxs0WubrFZd20xc12T33Y3MxLU1i7Vd0cy1NWvffZXJ\nrG3nzpvrdXvxxc+LHuGm5P0d42koROfNmxfnzp2L7u7uOHfuXMydO/eGx1ar1ahWq1dv1+v1Rp6S\nFlapVOyLNjQyMqfQ5y+XyzEyMjLh4+v14QkfW/TaJqtV1zYT1zXZfXcjM3FtzWJtVzRzbc3ad1/F\n68aXeX/Xvnp7eyd0XEOX5q5YsSKOHDkSERFHjhyJlStXNvIwAAAAtKFxz4i+9NJL8f7778fw8HBs\n2bIlnnjiiVi3bl3s3r07Xn/99ahUKvH8889nzAoAAEALGDdEn3322evev2PHjqYPAwAAQOtr6NJc\nAAAAaJQQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAA\nIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQ\nBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAA\nIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQ\nBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAA\nIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQ\nBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAA\nIFWp6AGAG6vV5hQ9wqQMDAwXPQIAADcBZ0QBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQB\nAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABI\nJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQB\nAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABI\nJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIVZrK\nN3//+9+PW2+9NTo7O6OrqysGBwebNRcAAAAtakohGhHx4x//OObOnduMWQAAAGgDLs0FAAAg1ZTP\niO7atSsiIr7zne9EtVq95utDQ0MxNDQUERGDg4NRqVSm+pS0mFKpZF/cQLncVfQIk1Kp3DLhY4te\nW2dnZ5TL5QkffzOtbbJadW0zcV2T3Xc3MhPX1izWdkUz19asffdVvG58mfd3jGdKIbpz586YP39+\nXLhwIX76059Gb29vLFmy5AvHVKvVLwRqvV6fylPSgiqVin1xAyMjc4oeYVLq9eEJH1v02srlcoyM\njEz4+JtpbZPVqmubieua7L67kZm4tmaxtiuaubZm7buv4nXjy7y/a1+9vb0TOm5Kl+bOnz8/IiLm\nzZsXK1eujJMnT07l4QAAAGgDDYfopUuX4uLFi1f//Ze//CUWL17ctMEAAABoTQ1fmnvhwoX4+c9/\nHhERn3/+eXz729+Oe++9t2mDAQAA0JoaDtGFCxfGz372s2bOAgAAQBvw51sAAABIJUQBAABIJUQB\nAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABI\nJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIJUQBAABIVSp6AJiqWm1O0SNMysDAcNEjAABA\noZwRBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQ\nBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAA\nIJUQBQAAIJUQBQAAIJUQBQAAIJUQBQAAIFWp6AFmmlptTtEjTNjAwHDRIwAAAEyaM6IAAACkEqIA\nAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACk\nEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIAAACkEqIA\nAACkEqIAAACkEqIAAACkKhU9AAAAQBFqtTlFjzApAwPDRY/QNM6IAgAAkEqIAgAAkEqIAgAAkEqI\nAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAA\nkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqIAgAAkEqI\nAgAAkKpU9ADkqNXmFD3CDZXLXTEy8sX5BgaGC5oGAKAxM/n91pd5r0XRnBEFAAAglRAFAAAglRAF\nAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAg\nlRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAglRAFAAAgVWkq3/zee+/FwYMHY2xs\nLNauXRvr1q1r1lwAAAC0qIbPiI6NjcWBAwfihRdeiN27d8ebb74Z//73v5s5GwAAAC2o4RA9efJk\n3H777bFw4cIolUpx3333xbFjx5o5GwAAAC2o4RA9e/Zs9PT0XL3d09MTZ8+ebcpQAAAAtK6Oy5cv\nX27kG//whz/En//859iyZUtERBw9ejROnjwZmzZt+sJxQ0NDMTQ0FBERg4ODUxwXAACAm13DZ0R7\nenrizJkzV2+fOXMmuru7rzmuWq3G4OCgCOWGtm/fXvQItCH7jiLYdxTBvqMI9h3jaThE77jjjvj4\n44/j9OnTMTo6Gm+99VasWLGimbMBAADQghr+8y1dXV2xadOm2LVrV4yNjcUDDzwQixYtauZsAAAA\ntKAp/R3R5cuXx/Lly5s1C22qWq0WPQJtyL6jCPYdRbDvKIJ9x3ga/rAiAAAAaETDvyMKAAAAjZjS\npbnQLL/85S/jnXfeiVKpFAsXLoytW7fG1772taLHokW99957cfDgwRgbG4u1a9fGunXrih6JFlev\n12Pv3r1x/vz56OjoiGq1Gg8//HDRY9EmxsbGYvv27TF//nyfZEqK//znP7F///7417/+FR0dHfG9\n730v7r777qLHYoYRoswIy5Yti6eeeiq6urriV7/6VfzmN7+Jp59+uuixaEFjY2Nx4MCB+NGPfhQ9\nPT3xwx/+MFasWBHf+MY3ih6NFtbV1RUbNmyI/v7+uHjxYmzfvj2WLVtm35Hi97//ffT19cXFixeL\nHoU2cfDgwbj33ntjYGAgRkdH47///W/RIzEDuTSXGeGb3/xmdHV1RUTE3XffHWfPni14IlrVyZMn\n4/bbb4+FCxdGqVSK++67L44dO1b0WLS47u7u6O/vj4iIWbNmRV9fn//nSHHmzJl49913Y+3atUWP\nQpsYGRmJv/3tb/Hggw9GRESpVHKVG9fljCgzzuuvvx733Xdf0WPQos6ePRs9PT1Xb/f09MQ//vGP\nAiei3Zw+fTpOnToVd955Z9Gj0AZ+8YtfxNNPP+1sKGlOnz4dc+fOjX379sU///nP6O/vj40bN8at\nt95a9GjMMEKUNDt37ozz589fc/+TTz4ZK1eujIiIX//619HV1RWrV6/OHo82cb0PCu/o6ChgEtrR\npUuXolarxcaNG6NcLhc9Di3unXfeiXnz5kV/f38cP3686HFoE59//nmcOnUqNm3aFHfddVccPHgw\nDh8+HE8++WTRozHDCFHSvPjii1/59TfeeCPeeeed2LFjhzBg2vT09MSZM2eu3j5z5kx0d3cXOBHt\nYnR0NGq1WqxevTpWrVpV9Di0gRMnTsQf//jH+NOf/hT/+9//4uLFi7Fnz574wQ9+UPRotLCenp7o\n6emJu+66KyIivvWtb8Xhw4cLnoqZSIgyI7z33nvx29/+Nn7yk5/ELbfcUvQ4tLA77rgjPv744zh9\n+nTMnz8/3nrrLW/KmHaXL1+O/fv3R19fXzz66KNFj0ObeOqpp+Kpp56KiIjjx4/H7373O//fMe1u\nu+226OnpiY8++ih6e3vjr3/9qw9m47qEKDPCgQMHYnR0NHbu3BkREXfddVds3ry54KloRV1dXbFp\n06bYtWtXjI2NxQMPPBCLFi0qeixa3IkTJ+Lo0aOxePHi2LZtW0RErF+/PpYvX17wZADNt2nTptiz\nZ0+Mjo7G17/+9di6dWvRIzEDdVy+3i9MAQAAwDTx51sAAABIJUQBAABIJUQBAABIJUQBAABIJUQB\nAABIJUQBAABIJUQBAABIJUQBAABI9X8A6V9SWp8w1HoAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "matplotlib.style.use('ggplot')\n", + "\n", + "data = list(map(lambda x: x[1], sorted_scores))\n", + "fig = plt.figure(figsize=(16,12))\n", + "ax = fig.add_subplot(111)\n", + "ax.hist(data, bins=15, color='blue', rwidth=0.8, alpha=0.5)\n", + "# locator = mdates.AutoDateLocator()\n", + "# ax.xaxis.set_major_locator(locator)\n", + "# ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(locator))\n", + "# plt.ylabel('num. posts')\n", + "# plt.xlabel('UTC time')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4 is the threshold" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 (cnn, hdtalk)\n", + "1 (cnn, msnbc)\n", + "2 (comey, kelly)\n", + "3 (usa, americafirst)\n", + "4 (p2, aca)\n", + "5 (tcot, pjnet)\n", + "6 (politics, hdtalk)\n", + "7 (theresistance, impeachtrump)\n", + "8 (tcot, p2)\n", + "9 (mondaymotivation, northkorea)\n", + "10 (p2, americafirst)\n", + "11 (kelly, msnbc)\n", + "12 (msnbc, americafirst)\n", + "13 (comey, msnbc)\n", + "14 (kelly, themooch)\n", + "15 (potus, donaldtrump)\n", + "16 (resist, theresistance)\n", + "17 (whitehousechaos, themooch)\n", + "18 (blacklivesmatter, msnbc)\n", + "19 (obamacare, aca)\n", + "20 (p2, usa)\n", + "21 (usa, iran)\n", + "22 (cnn, kelly)\n", + "23 (northkorea, russia)\n", + "24 (usa, cnn)\n", + "25 (usa, crime)\n", + "26 (tcot, usa)\n", + "27 (cnn, comey)\n", + "28 (trumptrain, americafirst)\n", + "Name: tag_pairs, dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get top hashtag pairs based on histogram dist.\n", + "ds_pmi_scores = pd.DataFrame(sorted_scores, columns=('tag_pairs', 'score'))\n", + "ds_top_pmi = ds_pmi_scores[ds_pmi_scores.score >= 4]\n", + "ser = ds_top_pmi['tag_pairs']\n", + "ser" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Group related tags together" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'americafirst',\n", + " 'blacklivesmatter',\n", + " 'cnn',\n", + " 'comey',\n", + " 'crime',\n", + " 'hdtalk',\n", + " 'kelly',\n", + " 'msnbc',\n", + " 'politics',\n", + " 'tcot',\n", + " 'themooch',\n", + " 'trumptrain',\n", + " 'usa',\n", + " 'whitehousechaos'},\n", + " {'comey', 'kelly'},\n", + " {'americafirst', 'iran', 'p2', 'usa'},\n", + " {'aca', 'obamacare', 'p2', 'tcot'},\n", + " {'pjnet', 'tcot'},\n", + " {'impeachtrump', 'resist', 'theresistance'},\n", + " {'mondaymotivation', 'northkorea', 'russia'},\n", + " {'donaldtrump', 'potus'}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "synonyms=[set(ser[0])]\n", + "\n", + "for a,b in ser:\n", + " for s in synonyms:\n", + " if a in s or b in s:\n", + " s.add(a);s.add(b)\n", + " break\n", + " else:\n", + " synonyms.append(set((a,b)))\n", + " \n", + " \n", + "synonyms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Those look really good. Nice job, Joe!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## TODO: use synonyms in silk_specter fasttext modeler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I've done so in another messy notebook, and the precision ticked up a couple of points." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/services/silk-specter/main.py b/services/silk-specter/main.py new file mode 100644 index 0000000..7950e01 --- /dev/null +++ b/services/silk-specter/main.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import sys, os, time, traceback +from fast_text_modeler import Model +sys.path.append(os.path.join(os.path.dirname(__file__), '../util')) +from redis_dispatcher import Dispatcher + +def set_err(job, msg): + job['state'] = 'error' + job['data'] = [] + job['error'] = msg + +def err_check(job): + # TODO: use lang? + required = {'start_time_ms', 'end_time_ms'} + if not required.issubset(job): + set_err(job, 'Missing some required fields {}'.format(required)) + +def process_message(key, job): + err_check(job) + if job['state'] == 'error': + return + + start_time = int(job['start_time_ms']) + end_time = int(job['end_time_ms']) + kafka_url = job['kafka_url'] if 'kafka_url' in job else 'print' + kafka_topic = job['kafka_topic'] if 'kafka_topic' in job else 'print' + + try: + model = Model() + + train_start_time = time.time() + model.train(start_time, end_time) + + predict_start_time = time.time() + model.predict(start_time, end_time, kafka_url, kafka_topic) + + print("--- %.0f sec. (train) ---" % (predict_start_time - train_start_time)) + print("--- %.0f sec. (predict) ---" % (time.time() - predict_start_time)) + + except Exception as e: + traceback.print_exc() + set_err(job, str(e)) + return + + job['data'] = [] # output sent to kafka + job['state'] = 'processed' + return + +if __name__ == '__main__': + dispatcher = Dispatcher(redis_host='redis', + process_func=process_message, + queues=['genie:topic_model']) + dispatcher.start() + + + # model = Model() + # model.train() + # model.predict(kafka_topic='abc', kafka_url='print') diff --git a/services/silk-specter/requirements.txt b/services/silk-specter/requirements.txt new file mode 100644 index 0000000..9b12981 --- /dev/null +++ b/services/silk-specter/requirements.txt @@ -0,0 +1,4 @@ +redis +stop_words +fasttext +kafka-python diff --git a/services/silk-specter/tokenizer.py b/services/silk-specter/tokenizer.py new file mode 100644 index 0000000..f3d9ee6 --- /dev/null +++ b/services/silk-specter/tokenizer.py @@ -0,0 +1,40 @@ +# def: text tokenize util. +# modified from util/sentiment_filters.py + +import re, traceback + +def is_special(word): + if not word: return True + if word[0]=='#' or word[0]=='@' or word[-1]=='#' or word[-1]=='@': + return True + return word.isdigit() + +def is_url(word): + if len(word) > 4 and (word[:4] == 'http' or word[:3] == 'www'): + return True + return False + +def pres_tokenize(caption, lang, b_filter_special=True, b_filter_url=True): + if lang=='en': + caption = re.sub('^rt ','', caption.lower(), flags=re.UNICODE) + # rm newlines + caption = re.sub('[\s]',' ', caption, flags=re.UNICODE) + # keep alphanums + hashtags + mentions + caption = re.sub('[^\w\s#@]','', caption, flags=re.UNICODE) + caption = list(filter(lambda x: x!='', caption.strip().split(' '))) + tokens = caption + # optionally rm hashtags, mentions, urls + if b_filter_special: + tokens = filter(lambda x: is_special(x) is not True, tokens) + if b_filter_url: + tokens = filter(lambda x: is_url(x) is not True, tokens) + return list(tokens) + # elif lang=='ar': + # try: + # caption = re.sub('[#]', ' ',caption, flags=re.UNICODE) + # return list(filter(lambda x: len(x)>1, Text(caption).words)) + # except: + # traceback.print_exc() + # return [] + else: + return [] diff --git a/services/silk-specter/topic_modeler.py b/services/silk-specter/topic_modeler.py new file mode 100644 index 0000000..bf53281 --- /dev/null +++ b/services/silk-specter/topic_modeler.py @@ -0,0 +1,21 @@ +from gensim.models.ldamodel import LdaModel +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), "../util")) +from sentiment_filters import SentimentFilter + +class TopicModeler: + def __init__(self): + self.models = {} + self.sf = SentimentFilter() + + def langs(self): + return self.models.keys() + + def load_lang(self, lang, model_path, model_name): + if model_path[-1] != "/": + model_path = model_path + "/" + self.models[lang] = LdaModel.load(model_path + model_name) + + def assign_topics(self, lang, text): + tokens = self.sf.tokenize(text, lang) + return self.models[lang][self.models[lang].id2word.doc2bow(tokens)] diff --git a/services/util/date_utils.py b/services/util/date_utils.py index afe8856..df530e9 100644 --- a/services/util/date_utils.py +++ b/services/util/date_utils.py @@ -16,7 +16,7 @@ def str_to_dt(str_dt, form=1): if form==1: #ex: 2014-08-10T01:59:31.979Z - GNIP Raw data, mongo SMP "created" return datetime.strptime(str_dt[:-5], "%Y-%m-%dT%H:%M:%S") else: - print "Unrecongnized format" + print("Unrecongnized format") return None def ms_time_interval(dt1, dt2): diff --git a/services/util/mongo_spark_client.py b/services/util/mongo_spark_client.py index 1d8b038..09b10d9 100644 --- a/services/util/mongo_spark_client.py +++ b/services/util/mongo_spark_client.py @@ -15,8 +15,8 @@ def __init__(self, master='local[*]', uri='mongodb://mongo:27017', self.sparkContext.setLogLevel(getenv('SPARK_LOG_LEVEL', 'ERROR')) self.mongo_uri = dict(uri=uri, database=db, collection=collection) - def read(self): - return self.spark.read.load(format=mongo_fmt, **self.mongo_uri) + def read(self, schema=None): + return self.spark.read.load(format=mongo_fmt, schema=schema, **self.mongo_uri) def write(self, df, mode='append'): df.write.format(mongo_fmt).mode(mode).options(**self.mongo_uri).save() diff --git a/services/util/nb_utils.py b/services/util/nb_utils.py new file mode 100644 index 0000000..a56406d --- /dev/null +++ b/services/util/nb_utils.py @@ -0,0 +1,29 @@ +# misc. jupyter nb helper functions + settings. + +import pandas as pd +import numpy as np + +# output all lines, not just the last one, in each cell. +from IPython.core.interactiveshell import InteractiveShell +InteractiveShell.ast_node_interactivity = 'all' + +# wider pd dataframe columns. +pd.set_option('max_colwidth', 800) + +# pretty-print pd entiy or spark dataframe info. +def pp(df, limit=10, raw=False): + if type(df) == np.ndarray: + return pd.DataFrame(df).head(limit) + elif type(df) == pd.core.frame.DataFrame: + print(df.shape) + return df.head(limit) + elif type(df) == pd.core.series.Series: + print(df.shape) + return df.head(limit) + else: + print(df.schema) + print('count: ', df.count()) + if raw: + return df.show(limit) + else: + return df.limit(limit).toPandas().head(limit) diff --git a/services/util/sentiment_filters.py b/services/util/sentiment_filters.py index fa72a52..9a88245 100644 --- a/services/util/sentiment_filters.py +++ b/services/util/sentiment_filters.py @@ -1,12 +1,12 @@ import re import traceback -from polyglot.text import Text +# from polyglot.text import Text from stop_words import get_stop_words class SentimentFilter: def __init__(self): self.good_langs = ['en', 'ar', 'ru'] - black_list = ['rt', 'amp'] + black_list = ['rt', 'amp', 'man', 'you', 'i', 'we', 'me', 'he', 'she', 'they', 'them'] self.stop = {'ar':set(get_stop_words('ar') + black_list), 'en':set(map(lambda x: re.sub('[^\w\s]', '', x, flags=re.UNICODE) ,get_stop_words('en')+black_list))}