openzim · benoit74 · Nov 2, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 30, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,10 +12,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 - Small fixes in invoke tasks
+- Force Python version to 3.11 (3.12 is not yet supported by our dependencies)
 
 ### Changed
+- Dockerfile: split installation of Python dependencies for more efficiency
+- GitHub workflow: publish `dev` tag on every push to `main` branch
+- GitHub workflow: build Docker image + test its startup
+- GitHub workflow: adopt new standard execution structure (`on` conditions)
 - Scraper (Python code) has been moved to the scraper subfolder
-- Vue.JS is now used as main UI framework ; all its code is in the zimui subfolder ; it is rendered with Vite to produce a static website
+- Vue.JS is now used as main UI framework
+  - all its code is in the zimui subfolder
+  - it is rendered with Vite to produce a static website
+  - developer instructions have been adapted
 - QA and Tests workflows have been adapted
     - to the new folder structure
     - to also QA and Test the Vue.JS part
@@ -26,9 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
     - these files are consumed by the Vue.JS UI
     - content (video, audio, pdf, epub, ...) is still rendered by Jinja2 as before
 - URLs are meaningful slugs
+    - permalink based on Kolibri node title + first 4 chars of node ID
     - generated by Python slugify lib
-    - from Kolibri node title
-    - should two distinct nodes have the same title resulting in the same slug, conflicts are handled with a _1, _2, ... suffix
 - changes in the ZIM "folder" structure:
     -  files generated by Vite are placed in /
     -  thumbnails are placed in /thumbnails
@@ -39,10 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - one new CLI argument --zimui-dist to specify the folder where zimui has been built (by Vite)
 
 
-- Dockerfile: split installation of Python dependencies for more efficiency
-- Github workflow: publish `dev` tag on every push to `main` branch
-- Github workflow: build Docker image + test its startup
-- Github workflow: adopt new standard execution structure (`on` conditions)
 
 ## [1.1.0] - 2023-07-25
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -25,12 +25,44 @@ To add a new locale (`fr` in this example, use only ISO-639-1):
 3. translate the PO file ([poedit](https://poedit.net/) is your friend)
 4. compile updated translation `pybabel compile -d kolibri2zim/locale -l fr`
 
-## releasing
-
-* Update your dependencies: `pip install -U setuptools wheel twine`
-* Make sure CHANGELOG is up-to-date
-* Bump version on `kolibri2zim/VERSION`
-* Build packages `python ./setup.py sdist bdist_wheel`
-* Upload to PyPI `python -m twine upload dist/kolibri2zim-1.0.0*`.
-* Commit your CHANGELOG + version bump changes
-* Tag version on git `git tag -a v1.0.0`
+
+## Developing the ZIM UI in Vue.JS
+
+Sometimes you need to alter something in the ZIM UI in Vue.JS but for this to work, you need assets which are generated by the scraper (e.g. channel.json, ...).
+
+To simplify this, it is possible to:
+- run the scraper (with original code base or your modified one)
+- extract assets from generated files and place them in a directory where ZIM UI will find them
+- iterate on ZIM UI code
+
+To achieve this, first build the Docker image based on current code base.
+
+```
+docker build -t local-kolibri2zim .
+```
+
+Scrape a channel (here we use the minimal channel, but you could use any other one of interest for your UI developments).
+
+```
+docker run --rm -it -v $PWD/output:/output local-kolibri2zim kolibri2zim --name "minimal_test" --title "Minimal Kolibri Channel Test" --description "This is a minimal Kolibri Channel, with new Kolibri UI" --channel-id "7f744ce8d28b471eaf663abd60c92267" --zim-file "Minimal_Test.zim"
+```
+
+Extract interesting ZIM content and move it to `public` folder.
+
+```
+find zimui/public/ -mindepth 1 -maxdepth 1 ! -name ".gitignore" -delete
+docker run -it --rm -v $(pwd)/output:/data ghcr.io/openzim/zim-tools:latest zimdump dump --dir=/data/Minimal_Test /data/Minimal_Test.zim
+sudo chown -R $(id -u -n):$(id -g -n) output/Minimal_Test
+mv output/Minimal_Test/* zimui/public/
+rm -rf output/Minimal_Test
+```
+
+Start ZIM UI locally.
+
+```
+cd zimui
+yarn dev
+```
+
+Do not forget to clean up the `public` folder before building the Docker image again; otherwise, all these assets will be pushed to the ZIM.
diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
 name = "kolibri2zim"
 authors = [{ name = "Kiwix", email = "[email protected]" }]
 keywords = ["kiwix", "zim", "offline", "kolibri"]
-requires-python = ">=3.11"
+requires-python = ">=3.11,<3.12"
 description = "Make ZIM file from Kolibri Channels"
 readme = "../README.md"
 license = { text = "GPL-3.0-or-later" }

diff --git a/scraper/src/kolibri2zim/schemas.py b/scraper/src/kolibri2zim/schemas.py
@@ -31,6 +31,13 @@ class TopicSection(CamelModel):
     subsections: list[TopicSubSection]
 
 
+class TopicParent(CamelModel):
+    """Information about a parent of one Kolibri topic"""
+
+    slug: str
+    title: str
+
+
 class Topic(CamelModel):
     """Class to serialize data about one Kolibri topic
 
@@ -39,7 +46,7 @@ class Topic(CamelModel):
     to current UI needs
     """
 
-    parents_slugs: list[str]
+    parents: list[TopicParent]
     title: str
     description: str
     sections: list[TopicSection]

diff --git a/scraper/src/kolibri2zim/scraper.py b/scraper/src/kolibri2zim/scraper.py
@@ -41,7 +41,13 @@
     get_size_and_mime,
     safer_reencode,
 )
-from kolibri2zim.schemas import Channel, Topic, TopicSection, TopicSubSection
+from kolibri2zim.schemas import (
+    Channel,
+    Topic,
+    TopicParent,
+    TopicSection,
+    TopicSubSection,
+)
 
 logger = get_logger()
 options = [
@@ -196,29 +202,53 @@ def get_or_create_node_slug(self, node) -> str:
         """Compute a unique slug to be used as URL for a given node"""
         if node["id"] in self.nodes_ids_to_slugs:
             return self.nodes_ids_to_slugs[node["id"]]
-        slug = slugify(str(node.get("title", node["id"])))
+        if "title" in node:
+            slug = f"{slugify(node['title'])}-{node['id'][:4]}"
+        else:
+            slug = node["id"]
         if slug in self.nodes_ids_to_slugs.values():
-            suffix = 1
-            while True:
-                if f"{slug}_{suffix}" not in self.nodes_ids_to_slugs.values():
-                    break
-                suffix += 1
-            slug = f"{slug}_{suffix}"
+            # detect extreme case where we have a conflict
+            conflicting_node_id = {
+                slug: node_id for node_id, slug in self.nodes_ids_to_slugs.items()
+            }[slug]
+            logger.error(
+                f"Slug conflict detected between node {conflicting_node_id} and node"
+                f" {node['id']}, both have same slug {slug}"
+            )
+            raise Exception("Slug conflict, cannot proceed any further")
         self.nodes_ids_to_slugs[node["id"]] = slug
         return slug
 
-    def add_channel_json(self):
+    def get_node_with_slugs(self, node_id, *, with_parents=False, with_children=False):
         node = self.db.get_node(
+            node_id=node_id, with_parents=with_parents, with_children=with_children
+        )
+
+        node["slug"] = self.get_or_create_node_slug(node)
+        if with_parents:
+            # transform generators into list so we can use them multiple times
+            node["parents"] = list(node["parents"])
+            for parent in node["parents"]:
+                parent["slug"] = self.get_or_create_node_slug(parent)
+        if with_children:
+            # transform generators into list so we can use them multiple times
+            node["children"] = list(node["children"])
+            for child in node["children"]:
+                child["slug"] = self.get_or_create_node_slug(child)
+        return node
+
+    def add_channel_json(self):
+        node = self.get_node_with_slugs(
             node_id=self.root_id, with_parents=True, with_children=True
         )
 
         with self.creator_lock:
             self.creator.add_item_for(
                 path="channel.json",
                 title=node["title"],
-                content=Channel(
-                    root_slug=self.get_or_create_node_slug(node)
-                ).model_dump_json(by_alias=True, indent=2),
+                content=Channel(root_slug=node["slug"]).model_dump_json(
+                    by_alias=True, indent=2
+                ),
                 mimetype="application/json",
                 is_front=False,
             )
@@ -342,16 +372,20 @@ def add_topic_node(self, node_id):
         Topic nodes are used only for hierarchy and solely contains metadata"""
 
         # fetch details including parents for breadcrumb and children to link to
-        node = self.db.get_node(node_id=node_id, with_parents=True, with_children=True)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(
+            node_id=node_id, with_parents=True, with_children=True
+        )
 
         with self.creator_lock:
             self.creator.add_item_for(
-                path=f"topics/{node_slug}.json",
+                path=f"topics/{node['slug']}.json",
                 title=node["title"],
                 content=Topic(
-                    parents_slugs=[
-                        self.get_or_create_node_slug(parent)
+                    parents=[
+                        TopicParent(
+                            slug=self.get_or_create_node_slug(parent),
+                            title=parent["title"],
+                        )
                         for parent in node["parents"]
                     ],
                     title=node["title"],
@@ -387,7 +421,7 @@ def add_topic_node(self, node_id):
                 mimetype="application/json",
                 is_front=False,
             )
-        logger.debug(f"Added topic #{node_id} - {node_slug}")
+        logger.debug(f"Added topic #{node_id} - {node['slug']}")
 
     def add_video_node(self, node_id):
         """Add content from this `video` node to zim
@@ -491,8 +525,7 @@ def add_video_node(self, node_id):
                 }
             )
 
-        node = self.db.get_node(node_id, with_parents=True)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(node_id, with_parents=True)
         html = self.jinja2_env.get_template("video.html").render(
             node_id=node_id,
             video_filename=video_filename,
@@ -504,13 +537,13 @@ def add_video_node(self, node_id):
         )
         with self.creator_lock:
             self.creator.add_item_for(
-                path=f"files/{node_slug}",
+                path=f"files/{node['slug']}",
                 title=node["title"],
                 content=html,
                 mimetype="text/html",
                 is_front=True,
             )
-        logger.debug(f"Added video #{node_id} - {node_slug}")
+        logger.debug(f"Added video #{node_id} - {node['slug']}")
 
     def add_video_upon_completion(self, future):
         """adds the converted video inside this future to the zim
@@ -597,8 +630,7 @@ def add_audio_node(self, node_id):
             return
         self.funnel_file(file["id"], file["ext"])
 
-        node = self.db.get_node(node_id, with_parents=True)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(node_id, with_parents=True)
         html = self.jinja2_env.get_template("audio.html").render(
             node_id=node_id,
             filename=filename_for(file),
@@ -609,13 +641,13 @@ def add_audio_node(self, node_id):
         )
         with self.creator_lock:
             self.creator.add_item_for(
-                path=f"files/{node_slug}",
+                path=f"files/{node['slug']}",
                 title=node["title"],
                 content=html,
                 mimetype="text/html",
                 is_front=True,
             )
-        logger.debug(f"Added audio #{node_id} - {node_slug}")
+        logger.debug(f"Added audio #{node_id} - {node['slug']}")
 
     def add_exercise_node(self, node_id):
         """Add content from this `exercise` node to zim
@@ -662,8 +694,7 @@ def add_exercise_node(self, node_id):
                 )
                 assessment_items.append(perseus_content)
 
-        node = self.db.get_node(node_id, with_parents=True, with_children=False)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(node_id, with_parents=True, with_children=False)
 
         # add all support files to ZIM
         for ark_member in zip_ark.namelist():
@@ -689,13 +720,13 @@ def add_exercise_node(self, node_id):
         )
         with self.creator_lock:
             self.creator.add_item_for(
-                path=f"files/{node_slug}",
+                path=f"files/{node['slug']}",
                 title=node["title"],
                 content=html,
                 mimetype="text/html",
                 is_front=True,
             )
-        logger.debug(f"Added exercise node #{node_id} - {node_slug}")
+        logger.debug(f"Added exercise node #{node_id} - {node['slug']}")
 
     def add_document_node(self, node_id):
         """Add content from this `document` node to zim
@@ -744,8 +775,7 @@ def get_is_epub(file):
             self.funnel_file(file["id"], file["ext"], path_prefix="files/")
             file["target"] = target_for(file)
 
-        node = self.db.get_node(node_id, with_parents=True)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(node_id, with_parents=True)
 
         # convert generator to list as we might read it twice
         node["parents"] = list(node["parents"])
@@ -759,7 +789,7 @@ def get_is_epub(file):
         for is_alt in options:
             html = self.jinja2_env.get_template("document.html").render(
                 node_id=node_id,
-                node_slug=node_slug,
+                node_slug=node["slug"],
                 main_document=filename_for(main_document),
                 main_document_ext=main_document["ext"],
                 alt_document=filename_for(alt_document) if alt_document else None,
@@ -770,7 +800,7 @@ def get_is_epub(file):
                 **node,
             )
             with self.creator_lock:
-                path = f"files/{node_slug}"
+                path = f"files/{node['slug']}"
                 if is_alt:
                     path += "_alt"
                 self.creator.add_item_for(
@@ -780,7 +810,7 @@ def get_is_epub(file):
                     mimetype="text/html",
                     is_front=is_alt,
                 )
-        logger.debug(f"Added document #{node_id} - {node_slug}")
+        logger.debug(f"Added document #{node_id} - {node['slug']}")
 
     def add_html5_node(self, node_id):
         """Add content from this `html5` node to zim
@@ -797,8 +827,7 @@ def add_html5_node(self, node_id):
         if not file:
             return
 
-        node = self.db.get_node(node_id)
-        node_slug = self.get_or_create_node_slug(node)
+        node = self.get_node_with_slugs(node_id)
 
         # download ZIP file to memory
         ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"])
@@ -811,9 +840,9 @@ def add_html5_node(self, node_id):
             if not self.dedup_html_files:
                 with self.creator_lock:
                     self.creator.add_item_for(
-                        path=f"files/{node_slug}/{ark_member}"
+                        path=f"files/{node['slug']}/{ark_member}"
                         if ark_member != "index.html"
-                        else f"files/{node_slug}",
+                        else f"files/{node['slug']}",
                         content=zip_ark.open(ark_member).read(),
                         is_front=(ark_member == "index.html"),
                     )
@@ -835,14 +864,14 @@ def add_html5_node(self, node_id):
             # add redirect to the unique sum-based entry for that file's path
             with self.creator_lock:
                 self.creator.add_redirect(
-                    path=f"files/{node_slug}/{ark_member}"
+                    path=f"files/{node['slug']}/{ark_member}"
                     if ark_member != "index.html"
-                    else f"files/{node_slug}",
+                    else f"files/{node['slug']}",
                     target_path=f"html5_files/{content_hash}",
                     is_front=ark_member == "index.html",
                 )
 
-        logger.debug(f"Added HTML5 node #{node_id} - {node_slug}")
+        logger.debug(f"Added HTML5 node #{node_id} - {node['slug']}")
 
     def run(self):
         if self.s3_url_with_credentials and not self.s3_credentials_ok():