Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
inhumantsar committed Dec 10, 2024
1 parent 94ac3c5 commit 3863178
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 38 deletions.
112 changes: 82 additions & 30 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -1607,6 +1607,58 @@ Readability.prototype = {
});
},

_extractJSONLDMetadata: function (parsed) {
var metadata = {};

if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) {
// we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
// put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
// "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.

var title = this._getArticleTitle();
var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75;

if (headlineMatches && !nameMatches) {
metadata.title = parsed.headline;
} else {
metadata.title = parsed.name;
}
} else if (typeof parsed.name === "string") {
metadata.title = parsed.name.trim();
} else if (typeof parsed.headline === "string") {
metadata.title = parsed.headline.trim();
}
if (parsed.author) {
if (typeof parsed.author.name === "string") {
metadata.byline = parsed.author.name.trim();
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
metadata.byline = parsed.author
.filter(function(author) {
return author && typeof author.name === "string";
})
.map(function(author) {
return author.name.trim();
})
.join(", ");
}
}
if (typeof parsed.description === "string") {
metadata.excerpt = parsed.description.trim();
}
if (
parsed.publisher &&
typeof parsed.publisher.name === "string"
) {
metadata.siteName = parsed.publisher.name.trim();
}
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}

return metadata;
},

/**
* Try to extract metadata from JSON-LD object.
* For now, only Schema.org objects of type Article or its subtypes are supported.
Expand All @@ -1630,13 +1682,6 @@ Readability.prototype = {
);
var parsed = JSON.parse(content);

// some sites, like ones for academic journals, separate metadata for a journal article or paper from the
// site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
// would be invisible without this.
if (parsed["mainEntity"]) {
parsed = parsed["mainEntity"];
}

if (
!parsed["@context"] ||
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
Expand All @@ -1660,7 +1705,7 @@ Readability.prototype = {
return;
}

metadata = {};
metadata = this._extractJSONLDMetadata(parsed);

if (
typeof parsed.name === "string" &&
Expand Down Expand Up @@ -1713,6 +1758,13 @@ Readability.prototype = {
if (typeof parsed.datePublished === "string") {
metadata.datePublished = parsed.datePublished.trim();
}
// some sites, like ones for academic journals, separate metadata for a journal article or paper from the
// site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
// would be invisible unless we retry using mainEntity.
if (parsed["mainEntity"] && Object.keys(metadata).length === 0) {
metadata = this._extractJSONLDMetadata(parsed["mainEntity"]);
}

} catch (err) {
this.log(err.message);
}
Expand All @@ -1728,25 +1780,18 @@ Readability.prototype = {
* @returns Name or names in "GivenName Surname" format
*/
_normalizeByline: function(name) {
if (!name) {
return name;
}

var result = name;

if (Array.isArray(name)) {
return name.map((n) => this._normalizeByline(n));
}

// handle Surname, GivenName formatting
if (name.includes(",")) {
const parts = name.split(",").map(part => part.trim());
if (parts.length == 2) {
result = `${parts[1]} ${parts[0]}`;
}
if (parts.length > 2) {
result = `${parts[1]} ${parts[0]} ${parts.slice(2).join(" ")}`;
}
}

// remove things like "By:"
result = result.replace(/\w+:/, "");
// remove things like "By:" and "http://"
result = result.replace(/\w+:\/{0,2}/, "");

return this._unescapeHtmlEntities(result);
},
Expand All @@ -1771,6 +1816,11 @@ Readability.prototype = {
// name is a single value
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication|description|title|site_name)\s*$/i;

// fields which are permitted to have multiple distinct values, eg: byline
var byline_properties = [ "dc:creator", "dcterm:creator", "author", "parsely-author", "citation_author"];
var multi_props = byline_properties; // concat others here. somewhat pointless atm, but there will be more...


// Find description tags.
this._forEachNode(metaElements, function (element) {
var elementName = element.getAttribute("name");
Expand Down Expand Up @@ -1803,18 +1853,19 @@ Readability.prototype = {
}
}

// handle properties which might have multiple distinct values, eg: citation_author
if (result) {
if (values[name]) {
// handle properties which might have multiple distinct values
if (values[name] && multi_props.includes(name)) {
if (Array.isArray(values[name]) && typeof result == "string") {
values[name].push(result);
}
if (typeof values[name] == "string") {
if (typeof values[name] == "string" && values[name] !== result) {
values[name] = [values[name], result];
}
} else {
values[name] = result;
}

this.log(`found metadata: ${name}=${values[name]}`);
}
});
Expand All @@ -1836,12 +1887,12 @@ Readability.prototype = {
}

// get author
metadata.byline = jsonld.byline ||
values["dc:creator"] ||
values["dcterm:creator"] ||
values.author ||
values["parsely-author"] ||
values["citation_author"];
metadata.byline = jsonld.byline;
for (const n of byline_properties) {
if (metadata.byline)
break;
metadata.byline = values[n];
}

// get description
metadata.excerpt =
Expand Down Expand Up @@ -1872,6 +1923,7 @@ Readability.prototype = {
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
metadata.publishedTime = this._unescapeHtmlEntities(metadata.publishedTime);
this.log(`getArticleMetadata complete: ${JSON.stringify(metadata)}`);

return metadata;
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "Dublin Core property title",
"byline": "Dublin Core property author",
"byline": "Dublin Core author",
"dir": null,
"excerpt": "Dublin Core property description",
"siteName": null,
Expand Down
8 changes: 5 additions & 3 deletions test/test-pages/003-metadata-preferred/source.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
<meta property="twitter:title" content="Twitter property title"/>
<meta property="og:title" content="Open Graph property title"/>
<meta name="author" content="Meta name author"/>
<meta name="DC.creator" content="Dublin Core name author"/>
<meta property="dc:creator" content="Dublin Core property author"/>
<meta name="description" content="Meta name description"/>
<!-- now that multiple authors are supported, these have to be identical to prevent them from showing up
as two separate authors -->
<meta name="DC.creator" content="Dublin Core author"/>
<meta property="dc:creator" content="Dublin Core author"/>
<meta name="description" content="Meta name description"/>
<meta name="og:description" content="Open Graph name description"/>
<meta name="twitter:description" content="Twitter name description"/>
<meta name="DC.description" content="Dublin Core name description"/>
Expand Down
2 changes: 1 addition & 1 deletion test/test-pages/ebb-org/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"title": "On Recent Controversial Events - Bradley M. Kuhn ( Brad ) ( bkuhn )",
"byline": "Bradley M. Kuhn (http://ebb.org/bkuhn/)",
"byline": "Bradley M. Kuhn (ebb.org/bkuhn/)",
"dir": null,
"lang": "en-US",
"excerpt": "The website of Bradley M. Kuhn, aka Brad, aka bkuhn. This site includes his GPG keys, resume, blog, projects list, software, interviews, speeches and writing.",
Expand Down
5 changes: 4 additions & 1 deletion test/test-pages/ietf-1/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"title": "remoteStorage",
"byline": "Jong, Michiel de",
"byline": [
"Kooman, F.",
"Jong, Michiel de"
],
"dir": null,
"lang": "en",
"siteName": null,
Expand Down
4 changes: 2 additions & 2 deletions test/test-pages/nature/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"title": "Worldwide divergence of values",
"byline": [
"Joshua Conrad Jackson",
"Danila Medvedev"
"Jackson, Joshua Conrad",
"Medvedev, Danila"
],
"dir": null,
"lang": "en",
Expand Down

0 comments on commit 3863178

Please sign in to comment.