Skip to content

Commit

Permalink
Update CMP crawler (#103)
Browse files Browse the repository at this point in the history
* Update autoconsent to 12.0.0

* CMPCollector relies on heuristics provided by autoconsent

* Remove debug logs

* Revert the cluster changes in clickhouse

* Do not produce completely empty cmp results

* Move a log message

* Lint fix

* Bump autoconsent

* Fix CMPCollector tests

* Mitigate linting errors
  • Loading branch information
muodov authored Jan 8, 2025
1 parent 7231f25 commit fb6e5cd
Show file tree
Hide file tree
Showing 6 changed files with 263 additions and 168 deletions.
129 changes: 31 additions & 98 deletions collectors/CMPCollector.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
/* eslint-disable max-lines */
const fs = require('fs');
const createDeferred = require('../helpers/deferred');
const waitFor = require('../helpers/waitFor');
const BaseCollector = require('./BaseCollector');

Expand All @@ -15,12 +14,12 @@ const BaseCollector = require('./BaseCollector');
* @typedef { import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage } OptOutResultMessage
* @typedef { import('@duckduckgo/autoconsent/lib/messages').OptInResultMessage } OptInResultMessage
* @typedef { import('@duckduckgo/autoconsent/lib/messages').DoneMessage } DoneMessage
* @typedef { { snippets: string[], patterns: string[] } } ScanResult
* @typedef { { snippets: Set<string>, patterns: Set<string>, filterListMatched: boolean } } ScanResult
*/

// @ts-ignore
const baseContentScript = fs.readFileSync(
require.resolve('@duckduckgo/autoconsent/dist/autoconsent.playwright.js'),
require.resolve('../node_modules/@duckduckgo/autoconsent/dist/autoconsent.playwright.js'),
'utf8'
);

Expand All @@ -46,32 +45,6 @@ function isIgnoredEvalError(e) {
);
}

// Regular expressions describing wording that typically appears in cookie
// consent popups. postLoad() runs each pattern with String.prototype.match
// against the combined innerText of the page's frames; every pattern carries
// the `i` (case-insensitive) and `g` (return all occurrences) flags.
// TODO: check for false positive detections per pattern
const DETECT_PATTERNS = [
/accept cookies/ig,
/accept all/ig,
/reject all/ig,
/only necessary cookies/ig, // "only necessary" is probably too broad
/by clicking.*(accept|agree|allow)/ig,
/by continuing/ig,
/we (use|serve)( optional)? cookies/ig,
/we are using cookies/ig,
/use of cookies/ig,
/(this|our) (web)?site.*cookies/ig,
/cookies (and|or) .* technologies/ig,
/such as cookies/ig,
/read more about.*cookies/ig,
/consent to.*cookies/ig,
/we and our partners.*cookies/ig,
/we.*store.*information.*such as.*cookies/ig,
/store and\/or access information.*on a device/ig,
/personalised ads and content, ad and content measurement/ig,

// it might be tempting to add the patterns below, but they cause too many false positives. Don't do it :)
// /cookies? settings/i,
// /cookies? preferences/i,
];

class CMPCollector extends BaseCollector {
id() {
return 'cmps';
Expand All @@ -88,12 +61,12 @@ class CMPCollector extends BaseCollector {
this.receivedMsgs = [];
this.selfTestFrame = null;
this.isolated2pageworld = new Map();
this.pendingScan = createDeferred();
this.context = options.context;
/** @type {ScanResult} */
this.scanResult = {
snippets: [],
patterns: [],
snippets: new Set([]),
patterns: new Set([]),
filterListMatched: false,
};
}

Expand Down Expand Up @@ -189,10 +162,12 @@ class CMPCollector extends BaseCollector {
/** @type {Partial<AutoconsentConfig>} */
const autoconsentConfig = {
enabled: true,
autoAction: null, // we request action explicitly later
autoAction: 'optOut',
disabledCmps: [],
enablePrehide: false,
enableCosmeticRules: true,
enableFilterList: true,
enableHeuristicDetection: true,
detectRetries: 20,
isMainWorld: false
};
Expand All @@ -203,14 +178,14 @@ class CMPCollector extends BaseCollector {
break;
}
case 'popupFound':
if (this.autoAction) {
await this.pendingScan.promise; // wait for the pattern detection first
await this._cdpClient.send('Runtime.evaluate', {
expression: `autoconsentReceiveMessage({ type: "${this.autoAction}" })`,
contextId: executionContextId,
});
if (msg.cmp === 'filterList') {
this.scanResult.filterListMatched = true;
}
break;
case 'report':
msg.state.heuristicPatterns.forEach(x => this.scanResult.patterns.add(x));
msg.state.heuristicSnippets.forEach(x => this.scanResult.snippets.add(x));
break;
case 'optInResult':
case 'optOutResult': {
if (msg.scheduleSelfTest) {
Expand Down Expand Up @@ -315,44 +290,6 @@ class CMPCollector extends BaseCollector {
}
}

async postLoad() {
/**
* @type {string[]}
*/
const foundPatterns = [];
const foundSnippets = [];
const pages = await this.context.pages();
if (pages.length > 0) {
const page = pages[0];
/**
* @type {Promise<string>[]}
*/
const promises = [];
page.frames().forEach(frame => {
// eslint-disable-next-line no-undef
promises.push(frame.evaluate(() => document.documentElement.innerText).catch(reason => {
this.log(`error retrieving text: ${reason}`);
// ignore exceptions
return '';
}));
});
const texts = await Promise.all(promises);
const allTexts = texts.join('\n');
for (const p of DETECT_PATTERNS) {
const matches = allTexts.match(p);
if (matches) {
foundPatterns.push(p.toString());
foundSnippets.push(...matches.map(m => m.substring(0, 200)));
}
}
}
this.pendingScan.resolve();
this.scanResult = {
patterns: foundPatterns,
snippets: Array.from(new Set(foundSnippets)),
};
}

/**
* @returns {CMPResult[]}
*/
Expand Down Expand Up @@ -394,8 +331,9 @@ class CMPCollector extends BaseCollector {
succeeded: false,
selfTestFail: Boolean(selfTestResult && !selfTestResult.result),
errors,
patterns: [],
snippets: [],
patterns: Array.from(this.scanResult.patterns),
snippets: Array.from(this.scanResult.snippets),
filterListMatched: this.scanResult.filterListMatched,
};

const found = this.findMessage({type: 'popupFound', cmp: msg.cmp});
Expand Down Expand Up @@ -427,25 +365,19 @@ class CMPCollector extends BaseCollector {
async getData() {
await this.waitForFinish();
const results = this.collectResults();
if (this.scanResult.patterns.length > 0) {
if (results.length > 0) {
results.forEach(r => {
r.patterns = this.scanResult.patterns;
r.snippets = this.scanResult.snippets;
});
} else {
results.push({
final: false,
name: '',
open: false,
started: false,
succeeded: false,
selfTestFail: false,
errors: [],
patterns: this.scanResult.patterns,
snippets: this.scanResult.snippets,
});
}
if (this.scanResult.patterns.size > 0 && results.length === 0) {
results.push({
final: false,
name: '',
open: false,
started: false,
succeeded: false,
selfTestFail: false,
errors: [],
patterns: Array.from(this.scanResult.patterns),
snippets: Array.from(this.scanResult.snippets),
filterListMatched: this.scanResult.filterListMatched,
});
}
return results;
}
Expand All @@ -462,6 +394,7 @@ class CMPCollector extends BaseCollector {
* @property {string[]} errors
* @property {string[]} patterns
* @property {string[]} snippets
* @property {boolean} filterListMatched
*/

module.exports = CMPCollector;
1 change: 1 addition & 0 deletions crawlerConductor.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstPa
*/
const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg);

// @ts-expect-error - outdated node types
const data = await crawl(url, {
log: prefixedLog,
// @ts-ignore
Expand Down
Loading

0 comments on commit fb6e5cd

Please sign in to comment.