Skip to content

Softnote import script

jaap-karssenberg edited this page Oct 25, 2013 · 1 revision

The following script can be used to convert XML exported from softnote into a zim notebook. It does not support all special characters in the RTF data generated by softnote, so some manual editing of the XML may be required for the script to run successfully.

file "softnote2zim.py":

#!/usr/bin/python

# -*- coding: utf-8 -*-

# Copyright 2011 Jaap Karssenberg <[email protected]>

# Simple script to convert softnote XML to a zim notebook folder
# Written as a quick hack, so quality of results may vary

# This script needs pyth, see http://pypi.python.org/pypi/pyth/

# TODO:
# * Looks like we lose strike formatting - blame pyth, other parser available for rtf ?
# * Nested formatting not supported by zim, but we output it anyway


import os
import sys
import re
sys.path.append('./pyth-0.5.6/')

from xml.etree import ElementTree
from StringIO import StringIO

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

from zim.fs import Dir, TmpFile
from zim.notebook import Notebook, Path

import zim.stores.xml
import zim.stores.files


def parse(data):
	'''Convert softnote XML to XML representing a zim notebook.

	The softnote export is not well-formed XML (bodies contain raw
	'&', '<' and '>'), so the input is first repaired line by line and
	only then parsed. Each C{XRECORDDATA} record becomes a C{<page>}
	element. Records whose C{XPARENT} is '0' are placed under a page
	named after their C{XCATALOG}; all other records are placed under
	the page created for the record whose C{XID} equals their C{XPARENT}.

	@param data: the complete softnote XML export as a string
	@returns: serialized XML with a C{<section>} root containing nested
	C{<page>} elements, whose text is the converted note body
	@raises AssertionError: when a sub-note appears before its parent
	'''
	# Pre-parse invalid XML
	# exploit the fact that softnote XML is nicely line based
	# per tag and tags do not contain any '&'
	fixed = []
	for line in data.splitlines():
		if line.startswith('<?xml'):
			# the XML declaration is passed through untouched
			fixed.append(line)
			continue

		line = line.replace('&', '&amp;')
		start = ''
		end = ''
		match = re.match(r'^<(/?\w+)>', line) # match xml tag at start of line
		if match:
			tag = match.group(1)
			l = len(tag) + 2 # tag name plus the surrounding '<' and '>'
			start = line[:l]
			line = line[l:]
			closing = '</%s>' % tag
			if line.endswith(closing): # also tag on end of line
				end = closing
				line = line[:-len(closing)]

		# whatever is left between the tags is content, escape it
		fixed.append(start + line.replace('<', '&lt;').replace('>', '&gt;') + end)

	xml = ''.join(piece + '\n' for piece in fixed)

	# Parse XML
	#~ open('intermediate.xml', 'w').write(xml) # DEBUG
	tree = ElementTree.fromstring(xml)
	notebook = ElementTree.Element('section')

	categories = {} # top level folders by name
	pages = {} # pages by id

	for xrecord in tree.findall('XRECORDDATA'):
		#~ print 'FOUND:', map(xrecord.findtext, ('XCATALOG', 'XSUBJECT', 'XID', 'XPARENT'))

		parentid = xrecord.findtext('XPARENT')
		if parentid == '0':
			# we found a top node within category
			category = xrecord.findtext('XCATALOG')
			if category not in categories:
				# first time we see this category
				el = ElementTree.Element('page', {'name': category})
				el.tail = '\n'
				notebook.append(el)
				categories[category] = el
			parent = categories[category]
		else:
			# some sub-note
			assert parentid in pages, 'Found sub-note before parent :('
			parent = pages[parentid]

		title = xrecord.findtext('XSUBJECT')
		name = title.replace(':', ' ') # ':' in a title will confuse hierarchy
		# clean up the ':'-free name, not the raw title, otherwise the
		# replacement above is thrown away
		name = Notebook.cleanup_pathname(name, purge=True) # make a valid name
		el = ElementTree.Element('page', {'name': name})
		el.tail = '\n'
		parent.append(el)

		page_id = xrecord.findtext('XID')
		pages[page_id] = el

		el.text = convert_rtf(xrecord.findtext('XBODY'))

	return ElementTree.tostring(notebook)

def convert_rtf(rtf):
	'''Converts rtf to zim wiki text'''
	print "DECODING >>>\n", rtf, '<<<\n'
	doc = Rtf15Reader.read(StringIO(rtf))
	html = XHTMLWriter.write(doc, pretty=True).read()
	return convert_html(html)

def convert_html(html):
	'''Turn a string of (X)HTML into zim wiki text.

	The markup is parsed with ElementTree and the resulting element
	tree is flattened by _serialize_html().

	@param html: well-formed (X)HTML as a string
	@returns: the wiki text as a string
	'''
	#~ print "GOT HTML:\n", html
	root = ElementTree.fromstring(html)
	return _serialize_html(root)

def _serialize_html(tree):
	text = tree.text or ''
	for el in tree:
		if el.tag == 'strong':
			text += "**" + _serialize_html(el) + "**"
		elif el.tag == 'em':
			text += "//" + _serialize_html(el) + "//"
		elif el.tag == 'u':
			text += "__" + _serialize_html(el) + "__"
		elif el.tag == 'strike':
			text += "~~" + _serialize_html(el) + "~~"
		else:
			text += _serialize_html(el)
		text += el.tail or ''
	return text


def dump(xml, folder):
	'''Takes zim notebook in XML format and dump to file structure'''
	sourcefile = TmpFile('softnote2zim-tmp')
	sourcefile.write(xml)
	source = zim.stores.xml.Store(FakeNotebook(), Path(':'), file=sourcefile)

	target = zim.stores.files.Store(FakeNotebook(), Path(':'), dir=Dir(folder))

	for s_page in source.walk():
		text = source.get_node(s_page).text
		#~ print 'PAGE:', s_page.name
		#~ print text

		t_page = target.get_page(s_page)
		assert not t_page.source.exists(), 'Don\'t want to overwrite %s' % t_page.source.path
		print 'Writing:', t_page.source.path
		t_page.source.write(text)


class FakeNotebook(object):
	'''Minimal stand-in passed to the store constructors in place of a
	real notebook object. It only carries an C{endofline} attribute.
	'''

	# 'dos' when running on Windows (os.name == 'nt'), 'unix' otherwise
	endofline = 'dos' if os.name == 'nt' else 'unix'



if __name__ == '__main__':
	if len(sys.argv) == 3:
		input = sys.argv[1]
		xml = parse(open(input).read())
		#~ print xml
		dump(xml, sys.argv[2])
	else:
		print 'Usage: softnote2zim.py SOFTNOTE_XML OUTPUT_FOLDER'
		print 'output folder should be a new empty folder'
Clone this wiki locally