Skip to content

Commit

Permalink
feat: fuzzy scraping (#572)
Browse files Browse the repository at this point in the history
  • Loading branch information
GeoffreyChen777 authored Jun 28, 2024
1 parent f38a92b commit 9c97327
Show file tree
Hide file tree
Showing 16 changed files with 401 additions and 53 deletions.
4 changes: 3 additions & 1 deletion app/locales/locales/en.GB.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
"historyreleasenote": "History Release Note",
"feed": "Feed",
"feedTime": "Feed Time",
"abstract": "Abstract"
"abstract": "Abstract",
"foundcandidates": "Found Candidates"
},
"smartfilter": {
"startops": "Start Ops",
Expand Down Expand Up @@ -73,6 +74,7 @@
"menu": {
"rescrape": "Scrape",
"rescrapefrom": "Scrape from",
"fuzzyscrape": "Fuzzily Scrape",
"removefrom": "Remove from",
"delete": "Delete",
"edit": "Edit",
Expand Down
4 changes: 3 additions & 1 deletion app/locales/locales/zh.CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
"historyreleasenote": "历史版本",
"feed": "订阅",
"feedTime": "更新时间",
"abstract": "摘要"
"abstract": "摘要",
"foundcandidates": "找到候选匹配"
},
"smartfilter": {
"startops": "起始运算符",
Expand Down Expand Up @@ -73,6 +74,7 @@
"menu": {
"rescrape": "搜寻元数据",
"rescrapefrom": "从 ... 搜寻元数据",
"fuzzyscrape": "模糊搜寻元数据",
"removefrom": "从 ... 移除",
"delete": "删除",
"edit": "编辑",
Expand Down
4 changes: 3 additions & 1 deletion app/locales/locales/zh.TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
"historyreleasenote": "歷史版本說明",
"feed": "訂閱",
"feedTime": "更新時間",
"abstract": "摘要"
"abstract": "摘要",
"foundcandidates": "找到候選匹配"
},
"smartfilter": {
"startops": "起始運算子",
Expand Down Expand Up @@ -73,6 +74,7 @@
"menu": {
"rescrape": "搜尋元數據",
"rescrapefrom": "從 ... 搜尋元數據",
"fuzzyscrape": "模糊搜尋元數據",
"removefrom": "從 ... 移除",
"delete": "刪除",
"edit": "編輯",
Expand Down
51 changes: 29 additions & 22 deletions app/main/services/contextmenu-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ export interface IContextMenuServiceState {
dataContextMenuShowInFinderClicked: number;
dataContextMenuEditClicked: number;
dataContextMenuScrapeClicked: number;
dataContextMenuFuzzyScrapeClicked: number;
dataContextMenuDeleteClicked: number;
dataContextMenuFlagClicked: number;
dataContextMenuExportBibTexClicked: number;
Expand Down Expand Up @@ -114,6 +115,7 @@ export class ContextMenuService extends Eventable<IContextMenuServiceState> {
dataContextMenuShowInFinderClicked: 0,
dataContextMenuEditClicked: 0,
dataContextMenuScrapeClicked: 0,
dataContextMenuFuzzyScrapeClicked: 0,
dataContextMenuDeleteClicked: 0,
dataContextMenuFlagClicked: 0,
dataContextMenuExportBibTexClicked: 0,
Expand Down Expand Up @@ -234,40 +236,32 @@ export class ContextMenuService extends Eventable<IContextMenuServiceState> {
},
},
{ type: "separator" },
{
label: this._locales.t("menu.edit"),
enabled: allowEdit,
accelerator: preferenceService.get("shortcutEdit") as string,
click: () => {
this.fire("dataContextMenuEditClicked");
},
},
{
label: this._locales.t("menu.rescrape"),
accelerator: preferenceService.get("shortcutScrape") as string,
click: () => {
this.fire("dataContextMenuScrapeClicked");
},
},

{
label: this._locales.t("menu.rescrapefrom"),
submenu: scraperMenuTemplate,
},
{
label: this._locales.t("menu.removefrom"),
submenu: [
{
label: this._locales.t("mainview.folders"),
submenu: removeFolderMenuTemplate,
},
{
label: this._locales.t("mainview.tags"),
submenu: removeTagMenuTemplate,
},
],
label: this._locales.t("menu.fuzzyscrape"),
click: () => {
this.fire("dataContextMenuFuzzyScrapeClicked");
},
},
{ type: "separator" },
{
label: this._locales.t("menu.edit"),
enabled: allowEdit,
accelerator: preferenceService.get("shortcutEdit") as string,
click: () => {
this.fire("dataContextMenuEditClicked");
},
},

{
label: this._locales.t("menu.delete"),
accelerator: preferenceService.get("shortcutDelete") as string,
Expand All @@ -282,6 +276,19 @@ export class ContextMenuService extends Eventable<IContextMenuServiceState> {
this.fire("dataContextMenuFlagClicked");
},
},
{
label: this._locales.t("menu.removefrom"),
submenu: [
{
label: this._locales.t("mainview.folders"),
submenu: removeFolderMenuTemplate,
},
{
label: this._locales.t("mainview.tags"),
submenu: removeTagMenuTemplate,
},
],
},
{ type: "separator" },
{
label: this._locales.t("menu.export"),
Expand Down Expand Up @@ -555,7 +562,7 @@ export class ContextMenuService extends Eventable<IContextMenuServiceState> {
click: () => {
this.fire({ supContextMenuRenameClicked: fileURL });
},
}
},
];
const menu = Menu.buildFromTemplate(template);
menu.popup();
Expand Down
2 changes: 2 additions & 0 deletions app/renderer/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { ShortcutService } from "@/renderer/services/shortcut-service";
import { SmartFilterService } from "@/renderer/services/smartfilter-service";
import { UISlotService } from "@/renderer/services/uislot-service";
import { UIStateService } from "@/renderer/services/uistate-service";
import { ScrapeService } from "@/renderer/services/scrape-service";

declare global {
var preferenceService: PreferenceService;
Expand All @@ -39,4 +40,5 @@ declare global {
var uiStateService: UIStateService;
var uiSlotService: UISlotService;
var querySentenceService: QuerySentenceService;
var scrapeService: ScrapeService
}
22 changes: 6 additions & 16 deletions app/renderer/services/paper-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,27 +103,17 @@ export class PaperFilterOptions implements IPaperFilterOptions {
}

static checkIsDateFilter(dateFilter: string) {
return dateFilter.match(
/(<|<=|>|>=)\s*\[\d+ DAYS\]/g
);
return dateFilter.match(/(<|<=|>|>=)\s*\[\d+ DAYS\]/g);
}

static parseDateFilter(dateFilter: string) {
const compareDateMatch = dateFilter.match(
/(<|<=|>|>=)\s*\[\d+ DAYS\]/g
);
const compareDateMatch = dateFilter.match(/(<|<=|>|>=)\s*\[\d+ DAYS\]/g);
if (compareDateMatch) {
for (const match of compareDateMatch) {
if (dateFilter.includes("<")) {
dateFilter = dateFilter.replaceAll(
match,
match.replaceAll("<", ">")
);
dateFilter = dateFilter.replaceAll(match, match.replaceAll("<", ">"));
} else if (dateFilter.includes(">")) {
dateFilter = dateFilter.replaceAll(
match,
match.replaceAll(">", "<")
);
dateFilter = dateFilter.replaceAll(match, match.replaceAll(">", "<"));
}
}
}
Expand Down Expand Up @@ -595,7 +585,7 @@ export class PaperService extends Eventable<IPaperServiceState> {
paperEntity.supURLs = paperEntity.supURLs.map((supURL) => {
if (supURL === url) {
const realSupURL = supURL.split(":::").pop();
return `${name}:::${realSupURL}`
return `${name}:::${realSupURL}`;
} else {
return supURL;
}
Expand Down Expand Up @@ -746,7 +736,7 @@ export class PaperService extends Eventable<IPaperServiceState> {
if (this._preferenceService.get("allowRoutineMatch") as boolean) {
if (
Math.round(Date.now() / 1000) -
(this._preferenceService.get("lastRematchTime") as number) <
(this._preferenceService.get("lastRematchTime") as number) <
7 * 86400 - 10
) {
return;
Expand Down
109 changes: 109 additions & 0 deletions app/renderer/services/scrape-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { ILogService, LogService } from "@/common/services/log-service";
import { PaperEntity } from "@/models/paper-entity";
import { HookService, IHookService } from "@/renderer/services/hook-service";
import { ProcessingKey, processing } from "@/renderer/services/uistate-service";
import { IPaperEntityCollection } from "@/repositories/db-repository/paper-entity-repository";

export const IScrapeService = createDecorator("scrapeService");

Expand Down Expand Up @@ -243,4 +244,112 @@ export class ScrapeService extends Eventable<{}> {

return scrapedPaperEntityDrafts;
}

/**
 * Fuzzily scrape metadata candidates for a collection of paper entities.
 *
 * The collection is processed sequentially in chunks of 10. If processing a
 * chunk throws, the error is logged and the remaining chunks are still
 * processed. Progress is reported through the log service only when the
 * collection holds 20 or more entities.
 *
 * @param paperEntities - The paper entities to find metadata candidates for.
 * @returns A record keyed by each entity's stringified `_id`; each value is
 *   the list of candidates found for that entity (empty when none came back). */
@processing(ProcessingKey.General)
// NOTE(review): the last argument looks like the fallback return value on
// error; if so, `{}` would match this method's declared Record return type
// better than `[]` — confirm against the `errorcatching` decorator.
@errorcatching(
  "Failed to fuzzily scrape data source.",
  true,
  "ScrapeService",
  []
)
async fuzzyScrape(
  paperEntities: IPaperEntityCollection
): Promise<Record<string, PaperEntity[]>> {
  // 0. Wait for scraper extension to be ready.
  await this._scrapeExtensionReady();

  const chunkSize = 10;
  // Small batches finish quickly, so progress is only reported for 20+ items.
  const reportProgress = paperEntities.length >= 20;
  // Random short ID that ties all progress messages of this run together.
  const jobID = Math.random().toString(36).substring(7);
  const results: Record<string, PaperEntity[]> = {};

  for (let i = 0; i < paperEntities.length; i += chunkSize) {
    if (reportProgress) {
      this._logService.progress(
        `Processing ${i} / ${paperEntities.length}...`,
        (i / paperEntities.length) * 100,
        true,
        "ScrapeService",
        jobID
      );
    }
    try {
      const paperEntityChunk = paperEntities.slice(i, i + chunkSize);

      const paperEntityDraftCandidates = await this._fuzzyScrape(
        paperEntityChunk
      );

      paperEntityChunk.forEach((p, index) => {
        // `_fuzzyScrape` may return fewer candidate lists than the chunk has
        // entities (e.g. an empty array); never store `undefined` in the
        // result, since callers expect an array for every key.
        results[`${p._id}`] = paperEntityDraftCandidates[index] ?? [];
      });
    } catch (e) {
      // A failing chunk must not abort the whole job: log and continue.
      this._logService.error(
        "Failed to fuzzily scrape data source.",
        `${(e as Error).message} ${(e as Error).stack}`,
        true,
        "ScrapeService"
      );
    }
  }

  if (reportProgress) {
    this._logService.progress(`Done!`, 100, true, "ScrapeService", jobID);
  }

  return results;
}

/**
 * Run the fuzzy-scrape hook pipeline for one batch of paper entities.
 *
 * Steps, each executed only when the corresponding hook is registered:
 *   1. `beforeFuzzyScrape` may modify the input entities.
 *   2. `fuzzyScrapeMetadata` produces one list of candidates per entity.
 *   3. `afterScrapeEntry` may modify the candidate lists.
 * After steps 2 and 3 every candidate is re-wrapped with `new PaperEntity(...)`.
 *
 * @param paperEntities - The paper entities to find metadata candidates for.
 * @returns One candidate list per input entity, in hook output order; an
 *   empty array when no `fuzzyScrapeMetadata` hook is registered. */
@processing(ProcessingKey.General)
@errorcatching("Failed to scrape entry.", true, "ScrapeService", [])
async _fuzzyScrape(
  paperEntities: IPaperEntityCollection
): Promise<PaperEntity[][]> {
  // Re-wrap every candidate as a PaperEntity. The result must be assigned
  // back by the caller: `map` returns new arrays and does not mutate.
  const toPaperEntities = (candidateLists: PaperEntity[][]): PaperEntity[][] =>
    candidateLists.map((candidates) =>
      candidates.map((candidate) => new PaperEntity(candidate))
    );

  if (this._hookService.hasHook("beforeFuzzyScrape")) {
    [paperEntities] = await this._hookService.modifyHookPoint(
      "beforeFuzzyScrape",
      5000,
      paperEntities
    );
  }

  let paperEntityDraftCandidates: PaperEntity[][] = [];
  if (this._hookService.hasHook("fuzzyScrapeMetadata")) {
    paperEntityDraftCandidates = await this._hookService.transformhookPoint<
      any[],
      Object[]
    >(
      "fuzzyScrapeMetadata",
      600000, // 10 min
      paperEntities
    );
    paperEntityDraftCandidates = toPaperEntities(paperEntityDraftCandidates);
  }

  // NOTE(review): this reuses the "afterScrapeEntry" hook point rather than a
  // fuzzy-specific one — confirm this is intentional.
  if (this._hookService.hasHook("afterScrapeEntry")) {
    [paperEntityDraftCandidates] = await this._hookService.modifyHookPoint(
      "afterScrapeEntry",
      5000,
      paperEntityDraftCandidates
    );

    paperEntityDraftCandidates = toPaperEntities(paperEntityDraftCandidates);
  }

  return paperEntityDraftCandidates;
}
}
8 changes: 8 additions & 0 deletions app/renderer/services/uistate-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ export interface IUIStateServiceState {
selectedQuerySentenceIds: string[];
selectedFeed: string;

showingCandidatesId: string;
metadataCandidates: Record<string, PaperEntity[]>;

editingPaperSmartFilter: PaperSmartFilter;

querySentencesSidebar: Array<string>;
Expand Down Expand Up @@ -85,6 +88,9 @@ export class UIStateService extends Eventable<IUIStateServiceState> {
selectedFeed: "feed-all",
dragingIds: [],

showingCandidatesId: "",
metadataCandidates: {},

querySentencesSidebar: [],
querySentenceCommandbar: "",

Expand Down Expand Up @@ -182,6 +188,8 @@ export class UIStateService extends Eventable<IUIStateServiceState> {
feedEditViewShown: false,
paperSmartFilterEditViewShown: false,
deleteConfirmShown: false,
overlayNoticationShown: false,
candidatesViewShown: false,
renderRequired: -1,
feedEntityAddingStatus: 0,
selectedIndex: [],
Expand Down
Loading

0 comments on commit 9c97327

Please sign in to comment.