Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add facility to extract and set ZIM metadata #1133

Merged
merged 6 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/unit/spec/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ var runTests = function () {
QUnit.module('zim_direntry_search_and_read');
QUnit.test("check DirEntry.fromStringId 'A Fool for You'", function (assert) {
var done = assert.async();
var aFoolForYouDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive._file, '5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined');
var aFoolForYouDirEntry = zimDirEntry.DirEntry.fromStringId(localZimArchive.file, '5856|7|A|0|2|A_Fool_for_You.html|A Fool for You|false|undefined');
Rishabhg71 marked this conversation as resolved.
Show resolved Hide resolved

assert.expect(2);
var callbackFunction = function (dirEntry, htmlArticle) {
Expand Down
32 changes: 18 additions & 14 deletions www/js/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -1232,11 +1232,8 @@ function setLocalArchiveFromArchiveList () {
}
}
resetCssCache();
selectedArchive = zimArchiveLoader.loadArchiveFromDeviceStorage(selectedStorage, archiveDirectory, function () {
settingsStore.setItem('lastSelectedArchive', archiveDirectory, Infinity);
// The archive is set : go back to home page to start searching
document.getElementById('btnHome').click();
}, function (message, label) {
settingsStore.setItem('lastSelectedArchive', archiveDirectory, Infinity);
zimArchiveLoader.loadArchiveFromDeviceStorage(selectedStorage, archiveDirectory, archiveReadyCallback, function (message, label) {
// callbackError which is called in case of an error
uiUtil.systemAlert(message, label);
});
Expand Down Expand Up @@ -1339,17 +1336,24 @@ function setLocalArchiveFromFileList (files) {
}
}
resetCssCache();
selectedArchive = null;
selectedArchive = zimArchiveLoader.loadArchiveFromFiles(files, function () {
// The archive is set : go back to home page to start searching
document.getElementById('btnHome').click();
document.getElementById('downloadInstruction').style.display = 'none';
}, function (message, label) {
zimArchiveLoader.loadArchiveFromFiles(files, archiveReadyCallback, function (message, label) {
// callbackError which is called in case of an error
uiUtil.systemAlert(message, label);
});
}

/**
 * Callback handed to the archive loaders: runs as soon as a ZIM archive has been loaded.
 * It records the archive as the currently selected one, clicks the Home button and
 * hides the download instructions.
 *
 * @param {ZIMArchive} archive The ZIM archive that has just been loaded
 */
function archiveReadyCallback (archive) {
    // Record the newly loaded archive as the app's current selection
    selectedArchive = archive;
    // Go back to the home page so that the user can start searching
    var homeButton = document.getElementById('btnHome');
    homeButton.click();
    // The download instructions are no longer relevant once an archive is loaded
    var downloadInstruction = document.getElementById('downloadInstruction');
    downloadInstruction.style.display = 'none';
}

/**
* Sets the localArchive from the File selects populated by user
*/
Expand Down Expand Up @@ -1654,7 +1658,7 @@ function readArticle (dirEntry) {
}

// We put the ZIM filename as a prefix in the URL, so that browser caches are separate for each ZIM file
iframeArticleContent.src = '../' + selectedArchive._file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
iframeArticleContent.src = '../' + selectedArchive.file.name + '/' + dirEntry.namespace + '/' + encodedUrl;
} else {
// In jQuery mode, we read the article content in the backend and manually insert it in the iframe
if (dirEntry.isRedirect()) {
Expand Down Expand Up @@ -2183,7 +2187,7 @@ function goToRandomArticle () {
// We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
// DEV: If articlePtrPos is defined in zimFile, then we are using a v1 article-only title listing. By definition,
// all dirEntries in an article-only listing must be articles.
if (selectedArchive._file.articlePtrPos || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
if (selectedArchive.file.articlePtrPos || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = false;
var activeContent = document.getElementById('activeContent');
if (activeContent) activeContent.style.display = 'none';
Expand Down Expand Up @@ -2214,7 +2218,7 @@ function goToMainArticle () {
document.getElementById('welcomeText').style.display = '';
} else {
// For now, this code doesn't support reading Zimit archives without error, so we warn the user and suggest some solutions
if (selectedArchive._file.zimType === 'zimit') {
if (selectedArchive.zimType === 'zimit') {
uiUtil.systemAlert(translateUI.t('dialog-unsupported-archivetype-message') || '<p>You are attempting to open a Zimit-style archive, which is currently unsupported in this app.</p>' +
'<p>There is experimental support for this kind of archive in the Kiwix JS PWA. Go to: ' +
'<a href="https://pwa.kiwix.org" target="_blank">https://pwa.kiwix.org</a>.</p>' +
Expand Down
134 changes: 92 additions & 42 deletions www/js/lib/zimArchive.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* zimArchive.js: Support for archives in ZIM format.
*
* Copyright 2015 Mossroy and contributors
* Copyright 2015-2023 Mossroy, Jaifroid and contributors
* Licence GPL v3:
*
* This file is part of Kiwix.
Expand Down Expand Up @@ -33,10 +33,17 @@ import utf8 from './utf8.js';
/**
* ZIM Archive
*
*
* @typedef ZIMArchive
* @property {ZIMFile} _file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} _language Language of the content
Rishabhg71 marked this conversation as resolved.
Show resolved Hide resolved
* @property {ZIMFile} file The ZIM file (instance of ZIMFile, that might physically be split into several actual files)
* @property {String} counter Counter of various types of content in the archive
* @property {String} creator Creator of the content
* @property {String} date Date of the creation of the archive
* @property {String} description Description of the content
* @property {String} language Language of the content
* @property {String} name Name of the archive
* @property {String} publisher Publisher of the content
* @property {String} title Title of the content
* @property {String} zimType Extended property: currently either 'open' for OpenZIM file type, or 'zimit' for the warc2zim file type used by Zimit
*/

/**
Expand Down Expand Up @@ -65,17 +72,16 @@ var LZ;
*/
function ZIMArchive (storage, path, callbackReady, callbackError) {
var that = this;
that._file = null;
that._language = ''; // @TODO
that.file = null;
var createZimfile = function (fileArray) {
zimfile.fromFileArray(fileArray).then(function (file) {
that._file = file;
that.file = file;
// Clear the previous libzimWorker
LZ = null;
// Set a global parameter to report the search provider type
params.searchProvider = 'title';
// File has been created, but we need to add any Listings which extend the archive metadata
that._file.setListings([
that.file.setListings([
// Provide here any Listings for which we need to extract metadata as key:value objects to be added to the file
// 'ptrName' and 'countName' contain the key names to be set in the archive file object
{
Expand All @@ -99,14 +105,14 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
}
]).then(function () {
// There is currently an exception thrown in the libzim wasm if we attempt to load a split ZIM archive, so we work around
var isSplitZim = /\.zima.$/i.test(that._file._files[0].name);
if (that._file.fullTextIndex && (params.debugLibzimASM || !isSplitZim && typeof Atomics !== 'undefined' &&
var isSplitZim = /\.zima.$/i.test(that.file._files[0].name);
if (that.file.fullTextIndex && (params.debugLibzimASM || !isSplitZim && typeof Atomics !== 'undefined' &&
// Note that Android and NWJS currently throw due to problems with Web Worker context
!/Android/.test(params.appType) && !(window.nw && that._file._files[0].readMode === 'electron'))) {
!/Android/.test(params.appType) && !(window.nw && that.file._files[0].readMode === 'electron'))) {
var libzimReaderType = params.debugLibzimASM || ('WebAssembly' in self ? 'wasm' : 'asm');
console.log('Instantiating libzim ' + libzimReaderType + ' Web Worker...');
LZ = new Worker('js/lib/libzim-' + libzimReaderType + '.js');
that.callLibzimWorker({ action: 'init', files: that._file._files }).then(function (msg) {
that.callLibzimWorker({ action: 'init', files: that.file._files }).then(function (msg) {
// console.debug(msg);
params.searchProvider = 'fulltext: ' + libzimReaderType;
// Update the API panel
Expand All @@ -117,25 +123,52 @@ function ZIMArchive (storage, path, callbackReady, callbackError) {
});
} else {
// var message = 'Full text searching is not available because ';
if (!that._file.fullTextIndex) {
if (!that.file.fullTextIndex) {
params.searchProvider += ': no_fulltext'; // message += 'this ZIM does not have a full-text index.';
} else if (isSplitZim) {
params.searchProvider += ': split_zim'; // message += 'the ZIM archive is split.';
} else if (typeof Atomics === 'undefined') {
params.searchProvider += ': no_atomics'; // message += 'this browser does not support Atomic operations.';
} else if (/Android/.test(params.appType)) {
params.searchProvider += ': no_sharedArrayBuffer';
} else if (params.debugLibzimASM === 'disable') {
params.searchProvider += ': disabled';
} else {
params.searchProvider += ': unknown';
}
uiUtil.reportSearchProviderToAPIStatusPanel(params.searchProvider);
}
// Set the archive file type ('open' or 'zimit')
that.setZimType();
// Add time-critical metadata from the M/ namespace that you need early access to here
// Note that adding metadata here delays the reporting of the ZIM archive as ready
// Further metadata are added in the background below, and can be accessed later
Promise.all([
that.addMetadataToZIMFile('Creator'),
that.addMetadataToZIMFile('Language')
]).then(function () {
console.debug('ZIMArchive ready, metadata will be added in the background');
// All listings should be loaded, so we can now call the callback
callbackReady(that);
});
// Add non-time-critical metadata to archive in background so as not to delay opening of the archive
// DEV: Note that it does not make sense to extract illustration (icon) metadata here. Instead, if you implement use of the illustration
// metadata as icons for the loaded ZIM [kiwix-js #886], you should simply use the ZIMArchive.getMetadata() function when needed
setTimeout(function () {
Promise.all([
that.addMetadataToZIMFile('Counter'),
that.addMetadataToZIMFile('Date'),
that.addMetadataToZIMFile('Description'),
that.addMetadataToZIMFile('Name'),
that.addMetadataToZIMFile('Publisher'),
that.addMetadataToZIMFile('Title')
]).then(function () {
console.debug('ZIMArchive metadata loaded:', that);
});
}, 1500);
}).catch(function (err) {
console.warn('Error setting archive listings: ', err);
});
// Set the archive file type ('open' or 'zimit')
params.zimType = that.setZimType();
// DEV: Currently, extended listings are only used for title (=article) listings when the user searches
// for an article or uses the Random button, by which time the listings will have been extracted.
// If, in the future, listings are used in a more time-critical manner, consider forcing a wait before
// declaring the archive to be ready, by chaining the following callback in a .then() function of setListings.
callbackReady(that);
});
};
if (storage && !path) {
Expand Down Expand Up @@ -189,27 +222,27 @@ ZIMArchive.prototype._searchArchiveParts = function (storage, prefixPath) {
* @returns {Boolean}
*/
ZIMArchive.prototype.isReady = function () {
return this._file !== null;
return this.file !== null;
};

/**
* Detects whether the supplied archive is a Zimit-style archive or an OpenZIM archive and
* sets a _file.zimType property accordingly; also returns the detected type. Extends ZIMFile.
* sets a zimType property accordingly; also returns the detected type. Extends ZIMArchive.
* @returns {String} Either 'zimit' for a Zimit archive, or 'open' for an OpenZIM archive
*/
ZIMArchive.prototype.setZimType = function () {
var fileType = null;
var archiveType = null;
if (this.isReady()) {
fileType = 'open';
this._file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) fileType = 'zimit';
archiveType = 'open';
this.file.mimeTypes.forEach(function (v) {
if (/warc-headers/i.test(v)) archiveType = 'zimit';
});
this._file.zimType = fileType;
console.debug('Archive type set to: ' + fileType);
this.zimType = archiveType;
console.debug('Archive type set to: ' + archiveType);
} else {
console.error('ZIMArchive is not ready! Cannot set ZIM type.');
}
return fileType;
return archiveType;
};

/**
Expand All @@ -219,8 +252,8 @@ ZIMArchive.prototype.setZimType = function () {
*/
ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
if (this.isReady()) {
var mainPageUrlIndex = this._file.mainPage;
this._file.dirEntryByUrlIndex(mainPageUrlIndex).then(callback);
var mainPageUrlIndex = this.file.mainPage;
this.file.dirEntryByUrlIndex(mainPageUrlIndex).then(callback);
}
};

Expand All @@ -230,7 +263,7 @@ ZIMArchive.prototype.getMainPageDirEntry = function (callback) {
* @returns {DirEntry}
*/
ZIMArchive.prototype.parseDirEntryId = function (dirEntryId) {
return zimDirEntry.DirEntry.fromStringId(this._file, dirEntryId);
return zimDirEntry.DirEntry.fromStringId(this.file, dirEntryId);
};

/**
Expand Down Expand Up @@ -335,7 +368,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefix = function (search, callback, noIn
ZIMArchive.prototype.getContentNamespace = function () {
var errorText;
if (this.isReady()) {
var ver = this._file.minorVersion;
var ver = this.file.minorVersion;
// DEV: There are currently only two defined values for minorVersion in the OpenZIM specification
// If this changes, adapt the error checking and return values
if (ver > 1) {
Expand All @@ -360,9 +393,9 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
var that = this;
var cns = this.getContentNamespace();
// Search v1 article listing if available, otherwise fallback to v0
var articleCount = this._file.articleCount || this._file.entryCount;
var articleCount = this.file.articleCount || this.file.entryCount;
util.binarySearch(0, articleCount, function (i) {
return that._file.dirEntryByTitleIndex(i).then(function (dirEntry) {
return that.file.dirEntryByTitleIndex(i).then(function (dirEntry) {
if (search.status === 'cancelled') return 0;
var ns = dirEntry.namespace;
// DEV: This search is redundant if we managed to populate articlePtrLst and articleCount, but it only takes two instructions and
Expand All @@ -387,7 +420,7 @@ ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function (prefix, s
nextStart: index
};
}
return that._file.dirEntryByTitleIndex(index).then(function (dirEntry) {
return that.file.dirEntryByTitleIndex(index).then(function (dirEntry) {
search.scanCount++;
var title = dirEntry.getTitleOrUrl();
// Only return dirEntries with titles that actually begin with prefix
Expand Down Expand Up @@ -488,7 +521,7 @@ ZIMArchive.prototype.callLibzimWorker = function (parameters) {
* @param {callbackDirEntry} callback
*/
ZIMArchive.prototype.resolveRedirect = function (dirEntry, callback) {
this._file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(callback);
this.file.dirEntryByUrlIndex(dirEntry.redirectTarget).then(callback);
};

/**
Expand Down Expand Up @@ -530,8 +563,8 @@ ZIMArchive.prototype.readBinaryFile = function (dirEntry, callback) {
*/
ZIMArchive.prototype.getDirEntryByPath = function (path) {
var that = this;
return util.binarySearch(0, this._file.entryCount, function (i) {
return that._file.dirEntryByUrlIndex(i).then(function (dirEntry) {
return util.binarySearch(0, this.file.entryCount, function (i) {
return that.file.dirEntryByUrlIndex(i).then(function (dirEntry) {
var url = dirEntry.namespace + '/' + dirEntry.url;
if (path < url) {
return -1;
Expand All @@ -543,7 +576,7 @@ ZIMArchive.prototype.getDirEntryByPath = function (path) {
});
}).then(function (index) {
if (index === null) return null;
return that._file.dirEntryByUrlIndex(index);
return that.file.dirEntryByUrlIndex(index);
}).then(function (dirEntry) {
return dirEntry;
});
Expand All @@ -555,9 +588,9 @@ ZIMArchive.prototype.getDirEntryByPath = function (path) {
*/
ZIMArchive.prototype.getRandomDirEntry = function (callback) {
// Prefer an article-only (v1) title pointer list, if available
var articleCount = this._file.articleCount || this._file.entryCount;
var articleCount = this.file.articleCount || this.file.entryCount;
var index = Math.floor(Math.random() * articleCount);
this._file.dirEntryByTitleIndex(index).then(callback);
this.file.dirEntryByTitleIndex(index).then(callback);
};

/**
Expand All @@ -582,6 +615,23 @@ ZIMArchive.prototype.getMetadata = function (key, callback) {
});
};

/**
 * Reads one metadata entry via getMetadata() and stores it as a property of this ZIMArchive.
 * The property name is the lower-cased key (e.g. 'Creator' is stored as archive.creator).
 * If getMetadata() supplies no (or a falsy) value, an empty string is stored instead.
 *
 * @param {String} key The key of the metadata to read (e.g. 'Creator', 'Language')
 * @returns {Promise<String>} A Promise that resolves with the stored metadata string ('' if none was supplied)
 */
ZIMArchive.prototype.addMetadataToZIMFile = function (key) {
    var that = this;
    // Use the locale-independent toLowerCase(): toLocaleLowerCase() follows the host locale, so under
    // e.g. a Turkish locale a key containing 'I' would yield a property name with a dotless 'ı'
    var propertyName = key.toLowerCase();
    return new Promise(function (resolve) {
        that.getMetadata(key, function (data) {
            // Normalize missing metadata to an empty string so the property is always a String
            var value = data || '';
            that[propertyName] = value;
            resolve(value);
        });
    });
};

export default {
ZIMArchive: ZIMArchive
};
Loading