Skip to content
This repository has been archived by the owner on Dec 30, 2021. It is now read-only.

Commit

Permalink
Add readme, update code, update package.json
Browse files Browse the repository at this point in the history
  • Loading branch information
s0ph1e committed Mar 13, 2017
1 parent ed29119 commit 940d3c7
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 25 deletions.
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,23 @@
WORK IN PROGRESS, PLUGIN IS NOT READY YET!
========================
## Introduction
Plugin for [website-scraper](https://github.com/s0ph1e/node-website-scraper) which returns html for dynamic websites using PhantomJS.

## Installation
```sh
npm install website-scraper website-scraper-phantom
```

## Usage
```javascript
const scrape = require('website-scraper');
const phantomHtml = require('website-scraper-phantom');

scrape({
urls: ['https://www.instagram.com/gopro/'],
directory: '/path/to/save',
httpResponseHandler: phantomHtml
}).then(console.log).catch(console.log);
```

## How it works
It starts PhantomJS, which simply opens the page and waits until the page has loaded.
This is far from ideal, because you will probably need to wait until some resource has loaded, click a button, or log in. Currently this module doesn't support such functionality.
10 changes: 6 additions & 4 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
var Promise = require('bluebird');
var getPhantomHtml = require('./get-phantom-html.js');
'use strict';

const Promise = require('bluebird');
const getPhantomHtml = require('./src/get-phantom-html.js');

/**
* Makes phantom request if response contains html, returns original response body otherwise
* @param {Object} response - response object from `request` module
* @return {Promise} - resolved with body if success, rejected if error
*/
module.exports = (response) => {
var contentType = response.headers['content-type'];
var isHtml = contentType && contentType.split(';')[0] === 'text/html';
const contentType = response.headers['content-type'];
const isHtml = contentType && contentType.split(';')[0] === 'text/html';
if (isHtml) {
return getPhantomHtml(response.request.href);
} else {
Expand Down
20 changes: 13 additions & 7 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
{
"name": "website-scraper-phantom-html",
"version": "0.0.0",
"description": "",
"name": "website-scraper-phantom",
"version": "0.1.0",
"description": "Plugin for website-scraper which receives html for dynamic websites using PhantomJS",
"readmeFilename": "README.md",
"main": "index.js",
"keywords": [
"website-scraper",
"phantomjs",
"html"
],
"dependencies": {
"phantomjs-prebuilt": "^2.1.14",
"system": "^1.2.0",
Expand All @@ -15,12 +21,12 @@
},
"repository": {
"type": "git",
"url": "git+https://github.com/s0ph1e/node-website-scraper-phantom-html.git"
"url": "git+https://github.com/s0ph1e/node-website-scraper-phantom.git"
},
"author": "s0ph1e",
"author": "Sophia Antipenko <[email protected]>",
"license": "MIT",
"bugs": {
"url": "https://github.com/s0ph1e/node-website-scraper-phantom-html/issues"
"url": "https://github.com/s0ph1e/node-website-scraper-phantom/issues"
},
"homepage": "https://github.com/s0ph1e/node-website-scraper-phantom-html#readme"
"homepage": "https://github.com/s0ph1e/node-website-scraper-phantom"
}
18 changes: 10 additions & 8 deletions get-phantom-html.js → src/get-phantom-html.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
var path = require('path');
var phantomjs = require('phantomjs-prebuilt');
var Promise = require('bluebird');
'use strict';

var scriptPath = path.join(__dirname, 'script.js');
const path = require('path');
const phantomjs = require('phantomjs-prebuilt');
const Promise = require('bluebird');

module.exports = function (url) {
const scriptPath = path.join(__dirname, 'script.js');

module.exports = (url) => {
return new Promise((resolve, reject) => {
var program = phantomjs.exec(scriptPath, url);
var stdout = '';
var stderr = '';
const program = phantomjs.exec(scriptPath, url);
let stdout = '';
let stderr = '';

program.stdout.on('data', (data) => {
stdout += data;
Expand Down
5 changes: 1 addition & 4 deletions script.js → src/script.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,10 @@ function done() {
phantom.exit(0);
}

page.onLoadFinished = function() {
setTimeout(done, 1000);
};

// Open the requested URL; the callback receives the load status string
// reported by PhantomJS ('success' when the page was opened).
page.open(url, function (status) {
	if (status !== 'success') {
		// Report the failure on stderr and request a non-zero exit code.
		system.stderr.write('Can\'t open page');
		phantom.exit(1);
		// Stop here: without this return the callback would fall through and
		// still schedule done() below, which finishes with phantom.exit(0).
		return;
	}
	// Page opened: wait one second before calling done().
	// NOTE(review): presumably the delay lets page scripts finish rendering — confirm.
	setTimeout(done, 1000);
});

0 comments on commit 940d3c7

Please sign in to comment.