-
Notifications
You must be signed in to change notification settings - Fork 71
Softnote import script
jaap-karssenberg edited this page Oct 25, 2013
·
1 revision
The following script can be used to convert XML exported from softnote into a zim notebook. It does not support all special characters in the RTF data generated by softnote, so some manual editing of the XML may be required for the script to run successfully.
file "softnote2zim.py":
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2011 Jaap Karssenberg <[email protected]>
# Simple script to convert softnote XML to a zim notebook folder
# Written as a quick hack, so quality of results may vary
# This script needs pyth, see http://pypi.python.org/pypi/pyth/
# TODO:
# * Looks like we lose strike formatting - blame pyth; is another parser available for rtf?
# * Nested formatting not supported by zim, but we output it anyway
import os
import sys
import re
sys.path.append('./pyth-0.5.6/')
from xml.etree import ElementTree
from StringIO import StringIO
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from zim.fs import Dir, TmpFile
from zim.notebook import Notebook, Path
import zim.stores.xml
import zim.stores.files
def parse(data):
    '''Converts softnote xml to xml representing a zim notebook

    The softnote export is not well-formed XML: text content is not
    escaped. The input is therefore first repaired line by line and
    only then handed to the XML parser.

    @param data: text of the softnote XML export
    @returns: the serialized notebook as produced by
    C{ElementTree.tostring()}: a "section" root element holding nested
    "page" elements, each with a "name" attribute and the converted
    wiki text of the note as its text
    @raises AssertionError: when a sub-note is found before its parent
    '''
    # Pre-parse invalid XML
    # exploit the fact that softnote XML is nicely line based
    # per tag and tags do not contain any '&'
    tag_re = re.compile(r'^<(/?\w+)>') # match xml tag at start of line
    lines = []
    for line in data.splitlines():
        if line.startswith('<?xml'):
            lines.append(line)
            continue

        # Must happen before '<' and '>' are turned into entities,
        # else the '&' of those entities would be escaped again
        line = line.replace('&', '&amp;')

        start = end = ''
        match = tag_re.match(line)
        if match:
            # Split off the opening tag so it is not escaped below
            start, line = line[:match.end()], line[match.end():]
            closing = '</%s>' % match.group(1)
            if line.endswith(closing): # also tag on end of line
                end = closing
                line = line[:-len(closing)]

        # Whatever is left is text content, so any remaining angle
        # brackets need to be escaped
        content = line.replace('<', '&lt;').replace('>', '&gt;')
        lines.append(start + content + end)

    xml = ''.join(line + '\n' for line in lines)

    # Parse XML
    #~ open('intermediate.xml', 'w').write(xml) # DEBUG
    tree = ElementTree.fromstring(xml)
    notebook = ElementTree.Element('section')
    categories = {} # top level folders by name
    pages = {} # pages by id
    for xrecord in tree.findall('XRECORDDATA'):
        #~ print 'FOUND:', map(xrecord.findtext, ('XCATALOG', 'XSUBJECT', 'XID', 'XPARENT'))
        parentid = xrecord.findtext('XPARENT')
        if parentid == '0':
            # we found a top node within category
            category = xrecord.findtext('XCATALOG')
            if category not in categories:
                # first time we see this category
                el = ElementTree.Element('page', {'name': category})
                el.tail = '\n'
                notebook.append(el)
                categories[category] = el
            parent = categories[category]
        else:
            # some sub-note
            assert parentid in pages, 'Found sub-note before parent :('
            parent = pages[parentid]

        title = xrecord.findtext('XSUBJECT')
        name = title.replace(':', ' ') # will confuse hierarchy
        # Clean up the name with ':' already removed, not the raw title
        name = Notebook.cleanup_pathname(name, purge=True) # make a valid name
        el = ElementTree.Element('page', {'name': name})
        el.tail = '\n'
        parent.append(el)
        pages[xrecord.findtext('XID')] = el
        el.text = convert_rtf(xrecord.findtext('XBODY'))

    return ElementTree.tostring(notebook)
def convert_rtf(rtf):
'''Converts rtf to zim wiki text'''
print "DECODING >>>\n", rtf, '<<<\n'
doc = Rtf15Reader.read(StringIO(rtf))
html = XHTMLWriter.write(doc, pretty=True).read()
return convert_html(html)
def convert_html(html):
    '''Converts html to zim wiki text

    @param html: a string that can be parsed as XML
    @returns: the text of the document with wiki markup inserted
    for the inline formatting known to L{_serialize_html()}
    '''
    #~ print "GOT HTML:\n", html
    wikitext = _serialize_html(ElementTree.fromstring(html))
    #~ print "MADE TEXT:\n", wikitext
    return wikitext
def _serialize_html(tree):
text = tree.text or ''
for el in tree:
if el.tag == 'strong':
text += "**" + _serialize_html(el) + "**"
elif el.tag == 'em':
text += "//" + _serialize_html(el) + "//"
elif el.tag == 'u':
text += "__" + _serialize_html(el) + "__"
elif el.tag == 'strike':
text += "~~" + _serialize_html(el) + "~~"
else:
text += _serialize_html(el)
text += el.tail or ''
return text
def dump(xml, folder):
'''Takes zim notebook in XML format and dump to file structure'''
sourcefile = TmpFile('softnote2zim-tmp')
sourcefile.write(xml)
source = zim.stores.xml.Store(FakeNotebook(), Path(':'), file=sourcefile)
target = zim.stores.files.Store(FakeNotebook(), Path(':'), dir=Dir(folder))
for s_page in source.walk():
text = source.get_node(s_page).text
#~ print 'PAGE:', s_page.name
#~ print text
t_page = target.get_page(s_page)
assert not t_page.source.exists(), 'Don\'t want to overwrite %s' % t_page.source.path
print 'Writing:', t_page.source.path
t_page.source.write(text)
class FakeNotebook(object):
    '''Stand-in for a notebook object that only carries the
    "endofline" attribute; instances are passed to the zim stores
    created in dump()
    '''

    # 'dos' on Windows, 'unix' everywhere else
    endofline = 'dos' if os.name == 'nt' else 'unix'
if __name__ == '__main__':
    # Command line entry point: expects exactly two arguments, the
    # softnote XML file to read and the folder to write the pages to
    if len(sys.argv) == 3:
        source_path, target_folder = sys.argv[1:]
        # Use a context manager so the input file is closed again
        # as soon as it has been read
        with open(source_path) as fh:
            data = fh.read()
        xml = parse(data)
        #~ print xml
        dump(xml, target_folder)
    else:
        print('Usage: softnote2zim.py SOFTNOTE_XML OUTPUT_FOLDER')
        print('output folder should be a new empty folder')