generate_registry

#!/usr/bin/env python3

import argparse
import os
import re
import urllib.request
import sys
import xml.etree.ElementTree

import jinja2

# Cache of the parsed Docbook documents, keyed by the location they were
# loaded from; get_document fills it so that each document is fetched once.
documents = dict()

def main():
    """Generate the registry sources from the Docbook DICOM standard.

    Parses the command line, gathers the data-elements dictionaries and the
    UIDs from the Docbook documents located under the ``root`` argument, then
    renders the Jinja templates stored next to this script to
    ``src/odil/registry*.h`` and ``src/odil/registry*.cpp``. Output paths are
    relative to the current working directory.
    """
    # Function-scope import: only used here to build "/"-separated locations.
    import posixpath

    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description: pass the text by keyword so that the usage line keeps
    # showing the script name.
    parser = argparse.ArgumentParser(description="Generate registry files")
    parser.add_argument(
        "root", nargs="?",
        default="http://dicom.nema.org/medical/dicom/current/source/docbook",
        help="Root URL or path to the Docbook version of the DICOM standard")

    arguments = parser.parse_args()

    jinja_environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
        trim_blocks=True)

    def locate(relative):
        # The root is a URL by default, so the separator must always be "/":
        # os.path.join would insert a backslash on Windows.
        return posixpath.join(arguments.root, relative)

    def group_suffix(group):
        # Numeric groups are rendered as four hexadecimal digits, the "misc"
        # group keeps its name.
        return "{:04x}".format(group) if group != "misc" else group

    # Tables holding the data elements, merged in a single dictionary.
    element_sources = [
        ("part07/part07.xml", "table_E.1-1"),
        ("part06/part06.xml", "table_7-1"),
        ("part06/part06.xml", "table_8-1"),
        ("part06/part06.xml", "table_6-1"),
    ]

    elements_dictionary = {}
    for relative, id_ in element_sources:
        entries = parse_elements_dictionaries(locate(relative), id_)
        for group, group_entries in entries.items():
            elements_dictionary.setdefault(group, []).extend(group_entries)

    # Tables holding the UIDs.
    uid_sources = [
        ("part06/part06.xml", "table_A-1"),
    ]

    uids = []
    for relative, id_ in uid_sources:
        uids.extend(parse_uids(locate(relative), id_))

    # One header and one source file per group of data elements.
    group_templates = {
        "src/odil/registry_{}.h": jinja_environment.get_template("registry_group.h.tmpl"),
        "src/odil/registry_{}.cpp": jinja_environment.get_template("registry_group.cpp.tmpl"),
    }

    for group, group_items in elements_dictionary.items():
        suffix = group_suffix(group)
        for path, template in group_templates.items():
            with open(path.format(suffix), "w") as fd:
                fd.write(template.render(
                    elements_dictionary=group_items, group=suffix))

    # Top-level header and source file, rendered with the UIDs and the list
    # of group suffixes.
    main_templates = {
        "src/odil/registry.h": jinja_environment.get_template("registry.h.tmpl"),
        "src/odil/registry.cpp": jinja_environment.get_template("registry.cpp.tmpl"),
    }
    groups = [group_suffix(group) for group in elements_dictionary]
    for path, template in main_templates.items():
        with open(path, "w") as fd:
            fd.write(template.render(uids=uids, groups=groups))

def get_document(url):
    """Return the parsed Docbook document located at ``url``.

    ``url`` is either the path of an existing local file or a URL accepted by
    ``urllib.request.urlopen``. The parsed documents are stored in the
    module-level ``documents`` cache, so each location is loaded only once.

    Returns the ``xml.etree.ElementTree.ElementTree`` of the document. Errors
    raised while opening or parsing the source are propagated to the caller.
    """
    if url not in documents:
        if os.path.exists(url):
            # The command line accepts a local path as root, but urlopen
            # rejects plain paths: let ElementTree read the file itself.
            print("Reading {}".format(url))
            documents[url] = xml.etree.ElementTree.parse(url)
        else:
            print("Downloading {}".format(url))
            # The context manager closes the response even if parsing fails.
            with urllib.request.urlopen(url) as fd:
                documents[url] = xml.etree.ElementTree.parse(fd)
    return documents[url]

def parse_elements_dictionaries(url, id_):
    """Parse a table of data elements from a Docbook document.

    :param url: location of the Docbook document, passed to get_document
    :param id_: value of the ``xml:id`` attribute of the table to parse
    :return: dictionary mapping each group to the list of its
        ``(tag, name, keyword, vr, vm)`` entries. For a fully-specified tag,
        ``tag`` is a ``(group, element)`` pair of integers and the dictionary
        key is the group number; any other tag is kept as a string and stored
        under the ``"misc"`` key. Rows with an empty keyword are skipped.
    :raises ValueError: if the document has no element with the given id
    """
    document = get_document(url)

    namespaces = {
        "xml": "http://www.w3.org/XML/1998/namespace",
        "docbook": "http://docbook.org/ns/docbook"
    }

    table = document.find(".//*[@xml:id=\"{}\"]".format(id_), namespaces)
    if table is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(
            "No element with id {!r} in {}".format(id_, url))

    # Tried in order, the first match wins: tags made only of hexadecimal
    # digits become a pair of integers, tags also containing "x" characters
    # become the concatenation of both halves. Built once, not once per row.
    converters = [
        (
            r"\(([0-9a-fA-F]{4}),([0-9a-fA-F]{4})\)",
            lambda x,y: (int(x, 16), int(y, 16))
        ),
        (
            r"\(([0-9a-fA-Fx]{4}),([0-9a-fA-Fx]{4})\)",
            lambda x,y: str(x+y)
        )
    ]

    entries = {}

    for row in table.iterfind("./docbook:tbody/docbook:tr", namespaces):
        entry = row.findall("./docbook:td/docbook:para", namespaces)

        tag, name, keyword, vr, vm = entry[:5]

        # When the paragraph has children, the text of the tag is read from
        # its emphasis child.
        if len(tag):
            tag = tag.find("./docbook:emphasis", namespaces)
        tag = tag.text

        for expression, converter in converters:
            match = re.match(expression, tag)
            if match:
                tag = converter(*match.groups())
                break

        keyword = get_value(keyword, "ascii", namespaces)
        if not keyword:
            continue

        name = get_value(name, "utf-8", namespaces)
        vr = get_value(vr, "ascii", namespaces)
        vm = get_value(vm, "ascii", namespaces)

        if isinstance(tag, tuple):
            group = tag[0]
        elif isinstance(tag, str):
            group = "misc"
        else:
            # The placeholder was missing, so the type was never reported.
            raise NotImplementedError(
                "Unknown tag type: {}".format(type(tag)))

        entries.setdefault(group, []).append((tag, name, keyword, vr, vm))

    return entries

def parse_uids(url, id_):
    """Parse a table of UIDs from a Docbook document.

    :param url: location of the Docbook document, passed to get_document
    :param id_: value of the ``xml:id`` attribute of the table to parse
    :return: list of ``(uid, name, keyword, type)`` tuples. Each row yields
        an entry using the keyword of the table and, when it is not already
        known, a second entry using a keyword derived from the name. Rows
        whose name is exactly ``(Retired)`` are skipped.
    :raises ValueError: if the document has no element with the given id
    """
    document = get_document(url)

    namespaces = {
        "xml": "http://www.w3.org/XML/1998/namespace",
        "docbook": "http://docbook.org/ns/docbook"
    }

    table = document.find(".//*[@xml:id=\"{}\"]".format(id_), namespaces)
    if table is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError(
            "No element with id {!r} in {}".format(id_, url))

    # Keywords of the table replaced by another identifier (presumably
    # because a generated identifier cannot start with a digit).
    keywords_map = {
        "12leadECGWaveformStorage": "TwelveleadECGWaveformStorage",
    }
    # Name-derived keywords replaced by another identifier.
    old_keywords_map = {
        "Verification": "VerificationSOPClass",
    }

    entries = []

    # Keywords of the rows processed so far.
    keywords = set()

    for row in table.iterfind("./docbook:tbody/docbook:tr", namespaces):
        entry = row.findall("./docbook:td/docbook:para", namespaces)
        uid, name, keyword, type_ = entry[:4]

        uid = get_value(uid, "ascii", namespaces)
        name = get_value(name, "ascii", namespaces)
        keyword = get_value(keyword, "ascii", namespaces)
        type_ = get_value(type_, "ascii", namespaces)

        if name == "(Retired)":
            continue

        keyword = keywords_map.get(keyword, keyword)

        entries.append((uid, name, keyword, type_))
        keywords.add(keyword)

        # Derive a keyword from the name: drop the retired marker and
        # everything after a colon, then remove all non-word characters.
        retired = name.endswith(" (Retired)")
        old_keyword = name.replace(" (Retired)", "")
        old_keyword = re.sub(":.*", "", old_keyword)
        # Raw string: "\W" is an invalid escape in a regular string literal.
        old_keyword = re.sub(r"\W", " ", old_keyword).strip().replace(" ", "")
        # The result of the lookup was previously discarded, so the map was
        # never applied.
        old_keyword = old_keywords_map.get(old_keyword, old_keyword)
        if retired:
            old_keyword += "_Retired"
        if old_keyword not in keywords and old_keyword not in keywords_map:
            entries.append((uid, name, old_keyword, type_))

    return entries

def get_value(element, encoding, namespaces):
    """Return the cleaned-up text of a Docbook element.

    The text is read from the ``docbook:emphasis`` child of ``element`` when
    there is one, from ``element`` itself otherwise; a missing text is
    treated as an empty string. Zero-width spaces (U+200B) are removed, micro
    signs (U+00B5) are replaced by ``u`` and the surrounding whitespace is
    stripped.

    ``encoding`` is accepted but not used. ``namespaces`` must map the
    ``docbook`` prefix to its namespace URI.
    """
    source = element.find("./docbook:emphasis", namespaces)
    if source is None:
        source = element

    text = source.text if source.text else ""
    for unwanted, replacement in ((u"\u200b", ""), (u"\u00b5", "u")):
        text = text.replace(unwanted, replacement)
    return text.strip()

# Run the generator only when the file is executed as a script, and use the
# value returned by main as the exit status of the process.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)