Skip to content

Commit

Permalink
ci/prepare-documentation.py: factor out the Markdown parsing code into a module
Browse files Browse the repository at this point in the history

I plan to add more scripts that reuse this logic. Another benefit of this is
that it'll let us switch out the underlying Markdown parser if mistune proves
unsatisfactory.
  • Loading branch information
Roman Donchenko committed Apr 2, 2021
1 parent 3b769af commit afabd42
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 28 deletions.
58 changes: 58 additions & 0 deletions ci/lib/omzdocs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

import mistune

# Markdown parser configured with mistune's AST renderer. DocumentationPage
# calls it with the page text and indexes the result as a sequence of node
# dicts (each with at least a 'type' key).
_parse_markdown = mistune.create_markdown(renderer=mistune.AstRenderer())

def _get_all_ast_nodes(ast_nodes):
    """Yield every node of a Markdown AST, parents before their children.

    Args:
        ast_nodes: iterable of AST node dicts. A node may have a 'children'
            entry holding either a list of nested nodes or a plain string.

    Yields:
        Each node, immediately followed by all of its descendants. A string
        'children' value is surfaced as a synthetic
        ``{'type': 'text', 'text': ...}`` node.
    """
    for node in ast_nodes:
        yield node
        if 'children' in node:
            children = node['children']
            # workaround for https://github.com/lepture/mistune/issues/269
            if isinstance(children, str):
                yield {'type': 'text', 'text': children}
            else:
                yield from _get_all_ast_nodes(children)

def _get_text_from_ast(ast_nodes):
    """Concatenate the text of a flat sequence of 'text' AST nodes.

    Args:
        ast_nodes: iterable of AST node dicts, each expected to have
            type 'text' and a 'text' entry.

    Returns:
        The 'text' values of all nodes joined together, in order.

    Raises:
        RuntimeError: if any node has a type other than 'text'.
    """
    def get_text_from_node(node):
        # Only plain text is supported; anything else is rejected rather
        # than silently dropped.
        if node['type'] != 'text':
            raise RuntimeError(f'unsupported node type: {node["type"]}')
        return node['text']

    return ''.join(map(get_text_from_node, ast_nodes))

# A reference from a documentation page to another resource. `type` is
# 'image' or 'link'; `url` is the target read from the matching AST node
# (see DocumentationPage.external_references).
ExternalReference = collections.namedtuple('ExternalReference', ['type', 'url'])

class DocumentationPage:
    """A Markdown documentation page parsed into an AST.

    The page title is taken from the first AST node when that node is a
    level 1 heading; otherwise the page has no title.
    """

    def __init__(self, markdown_text):
        """Parse *markdown_text* and extract the page title, if any.

        Raises:
            RuntimeError: if the leading level 1 heading contains anything
                other than plain text nodes.
        """
        self._ast = ast = _parse_markdown(markdown_text)

        self._title = None
        if ast and ast[0]['type'] == 'heading' and ast[0]['level'] == 1:
            self._title = _get_text_from_ast(ast[0]['children'])

    @property
    def title(self):
        """The text of the leading level 1 heading, or None if absent."""
        return self._title

    def external_references(self):
        """Yield an ExternalReference for every image and link in the page.

        References are produced in the order their nodes are visited by
        _get_all_ast_nodes.
        """
        for node in _get_all_ast_nodes(self._ast):
            if node['type'] == 'image':
                yield ExternalReference('image', node['src'])
            elif node['type'] == 'link':
                yield ExternalReference('link', node['link'])
37 changes: 9 additions & 28 deletions ci/prepare-documentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,21 @@
import logging
import re
import shutil
import sys
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

from pathlib import Path

import mistune
import yaml

# Parent of the directory that contains this script (after resolving the
# script's path).
OMZ_ROOT = Path(__file__).resolve().parents[1]

# Make the helper modules under ci/lib importable; this must happen before
# the `import omzdocs` below.
sys.path.append(str(OMZ_ROOT / 'ci/lib'))

import omzdocs

# The `id` attribute name qualified with the XML namespace URI, written in
# `{namespace}name` form.
XML_ID_ATTRIBUTE = '{http://www.w3.org/XML/1998/namespace}id'

# For most task types, taking the machine-readable form and replacing
Expand All @@ -63,29 +67,6 @@
'text_to_speech': 'Text-to-speech',
}

# Markdown parser configured with mistune's AST renderer.
parse_markdown = mistune.create_markdown(renderer=mistune.AstRenderer())


def get_all_ast_nodes(ast_nodes):
    """Yield every node of a Markdown AST, parents before their children.

    Args:
        ast_nodes: iterable of AST node dicts. A node may have a 'children'
            entry holding either a list of nested nodes or a plain string.

    Yields:
        Each node, immediately followed by all of its descendants. A string
        'children' value is surfaced as a synthetic
        ``{'type': 'text', 'text': ...}`` node.
    """
    for node in ast_nodes:
        yield node
        if 'children' in node:
            children = node['children']
            # workaround for https://github.com/lepture/mistune/issues/269
            if isinstance(children, str):
                yield {'type': 'text', 'text': children}
            else:
                yield from get_all_ast_nodes(children)


def get_text_from_ast(ast_nodes):
    """Concatenate the text of a flat sequence of 'text' AST nodes.

    Args:
        ast_nodes: iterable of AST node dicts, each expected to have
            type 'text' and a 'text' entry.

    Returns:
        The 'text' values of all nodes joined together, in order.

    Raises:
        RuntimeError: if any node has a type other than 'text'.
    """
    def get_text_from_node(node):
        # Only plain text is supported; anything else is rejected rather
        # than silently dropped.
        if node['type'] != 'text':
            raise RuntimeError(f'unsupported node type: {node["type"]}')
        return node['text']

    return ''.join(map(get_text_from_node, ast_nodes))


def add_page(output_root, parent, *, id=None, path=None, title=None):
if parent.tag == 'tab':
parent.attrib['type'] = 'usergroup'
Expand All @@ -105,13 +86,13 @@ def add_page(output_root, parent, *, id=None, path=None, title=None):
with (OMZ_ROOT / path).open('r', encoding='utf-8') as input_file:
lines = input_file.readlines()

ast = parse_markdown(''.join(lines))
page = omzdocs.DocumentationPage(''.join(lines))

if not ast or ast[0]['type'] != 'heading' or ast[0]['level'] != 1:
if page.title is None:
raise RuntimeError(f'{path}: must begin with level 1 heading')

if not title:
title = get_text_from_ast(ast[0]['children'])
title = page.title

element.attrib['title'] = title

Expand All @@ -128,7 +109,7 @@ def add_page(output_root, parent, *, id=None, path=None, title=None):
output_file.writelines(lines)

# copy all referenced images
image_urls = [node['src'] for node in get_all_ast_nodes(ast) if node['type'] == 'image']
image_urls = [ref.url for ref in page.external_references() if ref.type == 'image']

for image_url in image_urls:
parsed_image_url = urllib.parse.urlparse(image_url)
Expand Down

0 comments on commit afabd42

Please sign in to comment.