-
Notifications
You must be signed in to change notification settings - Fork 71
Moinmoin import script
As mentioned in the script's header comment, this is a modification of Slim Gaillard's moin2doku.py. I've removed some bugs, added the creation of a page header with a correct timestamp, and modified the formatting regexes. The file attachment conversion hasn't been removed, but it is neither tested nor adapted to ZIM. Feel free to make modifications or fixes. Just add them to this page.
By the way...
ZIM uses two different markups for verbatim/code blocks. verbatim text segments within the line are marked with ''
while whole blocks are marked with '''
! There is no text allowed after the opening '''
! The script currently (0.3) handles only the first markup. The REGEX based parsing of the script doesn't allow any "deeper inspection". So it isn't easy to add a secure detection of code blocks (nested or mixed markups,..).
file "moin2zim.py":
#!/usr/bin/python
#
# moin2zim.py -- Joerg Desch <joerg DOT desch AT googlemail DOT com>
#
# ... A modification of moin2doku.py by Slim Gaillard
# (see http://www.dokuwiki.org/tips:moinmoin2doku)
#
# IMPORTANT NOTE: this script is more a quick hack than a real project!
#
# "moin2zim" is a script for converting MoinMoin version 1.3+ wiki data to the
# Zim format. It tries to convert all pages and is not intended to convert
# a single page. You have to call it with the name of the directory containing
# the MoinMoin pages as first parameter, and the output directory as second
# parameter.
#
# example: python moin2zim.py ./my-moin/data/pages/ ./zim-import/
#
# In ./zim-import/ are all pages with the zim formatting and the Zim page header.
# You should have an eye on the files and then copy them into the Zim notebook
# directory.
#
# The script doesn't do all the work. Some formattings aren't supported, others
# are not available in Zim. For now, I've only done the stuff I've needed to
# convert my moin wiki.
#
# Missing stuff:
# * CamelCase links without [] are not detected as internal links.
# * verbatim (multiline) blocks (not the marks inside a line) aren't detected.
# The script still uses '' instead of '''!
# * some unsupported stuff is still in DokuWiki syntax.
# * currently only one moin icon is translated
# * attachments are neither tested nor checked for compatibility.
# * much other stuff I'm not aware of... ;-)
#
# version 0.3 (jd) first public release of my modifications.
#
import sys, os, os.path, re, pdb, time
from os import listdir
from os.path import isdir, basename
def check_dirs(moin_pages_dir, output_dir):
    """Abort the script unless both directories exist.

    The MoinMoin pages directory is checked before the output directory.
    For the first one that is not a directory, a message is written to
    stderr and the process exits with status 1.
    """
    required = (
        (moin_pages_dir, "MoinMoin pages directory doesn't exist!"),
        (output_dir, "Output directory doesn't exist!"),
    )
    for path, complaint in required:
        if isdir(path):
            continue
        # Message plus newline: the same bytes a print statement would emit.
        sys.stderr.write(complaint + "\n")
        sys.exit(1)
def get_path_names(moin_pages_dir):
    """Return the paths of all sub-directories directly inside *moin_pages_dir*.

    Every entry reported by listdir() is joined onto *moin_pages_dir*;
    entries that are not directories are dropped. The listdir() order is kept.
    """
    candidates = (os.path.join(moin_pages_dir, entry)
                  for entry in listdir(moin_pages_dir))
    return [candidate for candidate in candidates if isdir(candidate)]
def get_current_revision(page_dir):
    """Return the path of the newest revision file of a MoinMoin page.

    The newest revision is taken to be the entry of
    ``<page_dir>/revisions`` whose name sorts last in plain string order
    (presumably zero-padded revision numbers -- verify against your data).

    Returns '' when the page has no ``revisions`` directory or when that
    directory is empty. The empty case used to raise IndexError and abort
    the whole conversion run.
    """
    rev_dir = os.path.join(page_dir, 'revisions')
    if not isdir(rev_dir):
        return ''
    revisions = listdir(rev_dir)
    if not revisions:
        # A page directory without any stored revision has nothing to convert.
        return ''
    return os.path.join(rev_dir, max(revisions))
def copy_attachments(page_dir, attachment_dir):
    """Copy all attachments of a page into *attachment_dir*.

    Files are read from ``<page_dir>/attachments`` and written under their
    lower-cased name. Nothing happens when the page has no attachments
    directory.

    The copy is done with shutil instead of a shell ``cp`` command, so file
    names containing quotes, ``$`` or other shell metacharacters are copied
    correctly. *attachment_dir* may be given with or without a trailing
    path separator.

    Copying stays best-effort: a failure for one attachment is reported on
    stderr and the remaining attachments are still processed.
    """
    import shutil  # local import: shutil is not among the file-level imports

    source_dir = os.path.join(page_dir, 'attachments')
    if not isdir(source_dir):
        return
    for attachment in listdir(source_dir):
        source = os.path.join(source_dir, attachment)
        target = os.path.join(attachment_dir, attachment.lower())
        try:
            shutil.copy(source, target)
        except (IOError, OSError) as err:
            sys.stderr.write("Could not copy attachment %s: %s\n" % (source, err))
def _indent_bullet(match):
    """Replace 1-4 leading whitespace characters before '*' by one tab less."""
    return '\t' * (len(match.group(1)) - 1) + '*'


def _link_camel_case(match):
    """Wrap a bare CamelCase word in [[...]]; return protected spans unchanged."""
    protected = match.group(1)
    if protected is not None:
        return protected
    return '[[' + match.group(2) + ']]'


def convert_page(page, file):
    """Convert the lines of a MoinMoin page to Zim markup.

    page -- list of text lines; it is modified in place and also returned.
    file -- list of path components of the page. All components but the
            last form the namespace used for attachment references, the
            last one is used as the page name for sub-page links.

    Every line is run through an ordered list of regular expression
    substitutions. Compared with the earlier revision of this function:

    * All patterns are raw strings. The CamelCase rule used '\\b' in a
      normal string, which is a backspace character, so it never matched.
      It now matches real word boundaries, but skips existing [[links]],
      {{embeds}}, ''verbatim'' spans, URLs, converted heading lines and
      words escaped with a leading '!'.
    * The sub-page link rule runs before the generic internal link rule,
      which previously consumed the '["' marker first. It is non-greedy so
      that two sub-page links on one line are converted separately.
    * Bullet indentation is handled by a single rule. The former four rules
      used \\s, which also matches the tabs they had just produced, so every
      nested bullet ended up at column 0.
    """
    namespace = ':' + ''.join(part + ':' for part in file[:-1])
    rules = (
        (r'\[\[TableOfContents.*\]\]', ''),                  # remove
        (r'\[\[BR\]\]$', ''),                                # line break at end of line - remove
        (r'\[\[BR\]\]', '\n'),                               # line break
        ('#pragma section-numbers off', ''),                 # remove
        (r'^##.*?\n', ''),                                   # remove
        ('``', ''),                                          # remove
        (r'\["/(.*?)"\]', '[[' + file[-1] + r':\1]]'),       # sub-page link (before generic link rules)
        (r'\["', '[['),                                      # internal link open
        (r'"\]', ']]'),                                      # internal link close
        (r'\[(http[^ ]*) ([^\]]*)\]', r'[[\1|\2]]'),         # web link
        (r'^(\s{1,4})\*', _indent_bullet),                   # bullets: n leading blanks -> n-1 tabs
        (r'^\s\s\s\s1\.', ' -'),
        (r'^\s\s1\.', ' -'),
        (r'^\s1\.', ' -'),
        # Headings are first written with '=-' pairs so that the shorter
        # heading patterns below cannot match an already converted line;
        # the '=-' -> '=' rule then produces the final markers.
        (r'^\s*=====\s*(.*)\s*=====\s*$', r'=-=- \1 =-=-'),                  # heading 5
        (r'^\s*====\s*(.*)\s*====\s*$', r'=-=-=- \1 =-=-=-'),                # heading 4
        (r'^\s*===\s*(.*)\s*===\s*$', r'=-=-=-=- \1 =-=-=-=-'),              # heading 3
        (r'^\s*==\s*(.*)\s*==\s*$', r'=-=-=-=-=- \1 =-=-=-=-=-'),            # heading 2
        (r'^\s*=\s*(.*)\s=\s*$', r'=-=-=-=-=-=- \1 =-=-=-=-=-=-'),           # heading 1
        ('=-', '='),
        (r'/!\\', '**(!)**'),                                # attention icon
        (r'\|{2}', '|'),                                     # table separator
        (r"'{5}([^']*)'{5}", r'**//\1//**'),                 # bold and italic
        (r"'{3}([^']*)'{3}", r'**\1**'),                     # bold
        (r"'{2}([^']*)'{2}", r'//\1//'),                     # italic
        (r'\{{3}', "''"),                                    # open code/verbatim line segment
        (r'\}{3}', "''"),                                    # close code/verbatim line segment
        (r'\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', r'\1'),  # Date value
        ('attachment:(.*)', '{{' + namespace + r'\1|}}'),
        # CamelCase words become links. This runs last so that the spans
        # produced above can be recognised and left alone (group 1).
        (r"(^={2,6}\s.*"            # converted heading line
         r"|\[\[.*?\]\]"            # existing link
         r"|\{\{.*?\}\}"            # embedded attachment
         r"|''.*?''"                # verbatim segment
         r"|\b\w+://\S+)"           # bare URL
         r"|(?<![\[!])\b([A-Z]+[a-z]+[A-Z][A-Za-z]*)\b",
         _link_camel_case),
    )
    # Compile once per call instead of looking every pattern up per line.
    compiled = [(re.compile(pattern), replacement) for pattern, replacement in rules]
    for index, line in enumerate(page):
        for pattern, replacement in compiled:
            line = pattern.sub(replacement, line)
        page[index] = line
    return page
def print_help():
    """Write the usage summary to stdout and exit with status 0."""
    usage_lines = (
        "Usage: moin2zim.py <moinmoin pages directory> <output directory>",
        "Convert MoinMoin pages to ZIM Wiki.",
    )
    for text in usage_lines:
        # One line per write, newline-terminated like a print statement.
        sys.stdout.write(text + "\n")
    sys.exit(0)
def print_parameter_error():
    """Report wrong command-line parameters on stderr and exit with status 1."""
    complaint = 'Incorrect parameters! Use --help switch to learn more.'
    sys.stderr.write(complaint + '\n')
    sys.exit(1)
def fix_name(filename):
    """Translate MoinMoin's escaped page directory name into a Zim-friendly name.

    MoinMoin stores special characters of a page name as lower-case hex
    byte values in parentheses, e.g. '(20)' for a space or '(c3bc)' for the
    UTF-8 encoded u-umlaut. The known sequences are replaced by plain ASCII
    substitutes, and literal dots become underscores.

    '(2f)' (the '/' of sub-pages) is deliberately left alone; the caller
    splits the name on it to build the namespace directories. Sequences
    that are not listed below are left unchanged.

    Compared with the earlier revision, the capital O-umlaut '(c396)' is
    now handled like the other umlauts.
    """
    replacements = (
        ('(2d)', '-'),          # hyphen
        ('(20)', '_'),          # space -> underscore
        ('(2e)', '_'),          # decimal point -> underscore
        ('(29)', '_'),          # ) -> underscore
        ('(28)', '_'),          # ( -> underscore
        ('(2b)', '+'),
        ('(2b2b)', '++'),
        ('(2b2b2d)', '++-'),
        ('.', '_'),             # decimal point -> underscore
        ('(2c20)', '_'),        # comma + space -> underscore
        ('(2028)', '_'),        # space + ( -> underscore
        ('(2920)', '_'),        # ) + space -> underscore
        ('(2220)', 'inch_'),    # " + space -> inch + underscore
        ('(3a20)', '_'),        # : + space -> underscore
        ('(202827)', '_'),      # space + ( + ' -> underscore
        ('(2720)', '_'),        # ' + space -> underscore
        ('(c39c)', 'Ue'),       # umlaut
        ('(c3bc)', 'ue'),       # umlaut
        ('(c384)', 'Ae'),       # umlaut
        ('(c3a4)', 'ae'),       # umlaut
        ('(c396)', 'Oe'),       # umlaut (was missing)
        ('(c3b6)', 'oe'),       # umlaut
    )
    for token, substitute in replacements:
        filename = filename.replace(token, substitute)
    return filename
#
# "main" starts here
#
def _ensure_dir(path):
    """Create the directory *path* unless it already exists (one level only)."""
    if not isdir(path):
        os.mkdir(path)


def _export_page(pathname, output_dir):
    """Convert one MoinMoin page directory and write the result below *output_dir*.

    Pages without a current revision file and MoinEditorBackup pages are
    skipped. Pages without a namespace are written to the 'unsorted'
    directory. Attachments are copied to a parallel tree below
    ``<output_dir>/media``.

    All paths are built with os.path.join, so *output_dir* works with or
    without a trailing slash (plain string concatenation used to require it).
    """
    curr_rev = get_current_revision(pathname)
    if not os.path.exists(curr_rev):
        return
    page_name = basename(pathname)
    if 'MoinEditorBackup' in page_name:  # don't convert backups
        return

    with open(curr_rev, 'r') as source:
        content = source.readlines()

    page_name = fix_name(page_name)
    parts = page_name.split('(2f)')  # namespaces
    leaf = parts[-1]
    # Pages without a namespace are collected below 'unsorted'.
    parents = parts[:-1] if len(parts) > 1 else ['unsorted']

    page_dir = output_dir
    attachment_dir = os.path.join(output_dir, 'media')
    _ensure_dir(attachment_dir)
    for name in parents:
        page_dir = os.path.join(page_dir, name)
        _ensure_dir(page_dir)
        attachment_dir = os.path.join(attachment_dir, name)
        _ensure_dir(attachment_dir)

    content = convert_page(content, parents + [leaf])

    # open the file and add the ZIM header
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S.0", time.localtime())
    with open(os.path.join(page_dir, leaf + '.txt'), 'w') as out:
        out.write('Content-Type: text/x-zim-wiki\nWiki-Format: zim 0.4\nCreation-Date: ')
        out.write(stamp)
        out.write('\n\n')
        # write the content of the page
        out.writelines([it.rstrip() + '\n' for it in content if it])

    # The trailing separator keeps this call valid for implementations of
    # copy_attachments that concatenate the directory and the file name.
    copy_attachments(pathname, os.path.join(attachment_dir, ''))


if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help'):
    print_help()  # exits
if len(sys.argv) < 3:
    print_parameter_error()  # exits
moin_pages_dir = sys.argv[1]
output_dir = sys.argv[2]
check_dirs(moin_pages_dir, output_dir)
sys.stdout.write('Input dir is: %s.\n' % moin_pages_dir)
sys.stdout.write('Output dir is: %s.\n' % output_dir)
pathnames = get_path_names(moin_pages_dir)
for pathname in pathnames:
    _export_page(pathname, output_dir)
file "moin2zim.py":
#!/usr/bin/env python3
"""
Based on the moin2zim.py script by Joerg Desch.
New in this version:
* updated to work in Python 3
* edited some comments
* added the code for the missing Umlaut
"""
########################################################################
# moin2zim.py -- Joerg Desch <joerg DOT desch AT googlemail DOT com>
#
# ... A modification of moin2doku.py by Slim Gaillard
# (see http://www.dokuwiki.org/tips:moinmoin2doku)
#
# IMPORTANT: This script is a quick hack rather than a full project!
#
# This file is a script for converting MoinMoin version 1.3+ wiki data
# to the Zim format. It tries to convert all pages and is not intended
# to convert a single page. You must call it with the name of the
# directory containing the MoinMoin pages as the first parameter and the
# output directory as the second parameter.
#
# example:
# python3 moin2zim.py /moin/data/pages/ /zim-output/
#
# The zim-output directory will contain all the pages with the Zim
# formatting and the Zim page header. It will also contain the images
# and attachments from your MoinMoin wiki. You must copy all of this
# data into your Zim notebook directory in the proper places.
#
# The script doesn't do all the work for you. Some formatting isn't
# supported because it hasn't been coded below and some formatting
# simply isn't available in Zim.
#
# Missing stuff:
# * CamelCase links without [] are not detected as internal links.
# * Verbatim (multiline) blocks (not the marks inside a line) aren't detected.
# The script still uses '' instead of '''!
# * Some unsupported stuff is still in DokuWiki syntax (like colored notification labels, etc.).
# * Currently, only one moin icon is translated.
# * Attachments are neither tested nor checked for compatibility.
# * Potentially, lots of other stuff.
#
# version 0.3 first public release of Joerg Desch's modifications.
# version 0.4 first public release of Elliria's modifications.
########################################################################
import sys, os, os.path, re, pdb, time
from os import listdir
from os.path import isdir, basename
def check_dirs(moin_pages_dir, output_dir):
    """Exit with status 1 unless both arguments name existing directories.

    The pages directory is tested first. The message for the first missing
    directory is printed to stderr before exiting.
    """
    if not isdir(moin_pages_dir):
        problem = "MoinMoin pages directory doesn't exist!"
    elif not isdir(output_dir):
        problem = "Output directory doesn't exist!"
    else:
        return
    print(problem, file=sys.stderr)
    sys.exit(1)
def get_path_names(moin_pages_dir):
    """List the directories found directly inside *moin_pages_dir*.

    Returns the joined paths (pages dir + entry name) of all entries that
    are directories, in the order listdir() reports them.
    """
    joined = [os.path.join(moin_pages_dir, name) for name in listdir(moin_pages_dir)]
    return list(filter(isdir, joined))
def get_current_revision(page_dir):
    """Return the path of the newest revision file of a MoinMoin page.

    The newest revision is taken to be the entry of
    ``<page_dir>/revisions`` whose name sorts last in plain string order
    (presumably zero-padded revision numbers -- verify against your data).

    Returns '' when the page has no ``revisions`` directory or when that
    directory is empty. The empty case used to raise IndexError and abort
    the whole conversion run.
    """
    rev_dir = os.path.join(page_dir, 'revisions')
    if not isdir(rev_dir):
        return ''
    revisions = sorted(listdir(rev_dir))
    if not revisions:
        # A page directory without any stored revision has nothing to convert.
        return ''
    return os.path.join(rev_dir, revisions[-1])
def copy_attachments(page_dir, attachment_dir):
    """Copy all attachments of a page into *attachment_dir*.

    Files are read from ``<page_dir>/attachments`` and written under their
    lower-cased name. Nothing happens when the page has no attachments
    directory.

    The copy is done with shutil instead of a shell ``cp`` command, so file
    names containing quotes, ``$`` or other shell metacharacters are copied
    correctly. *attachment_dir* may be given with or without a trailing
    path separator.

    Copying stays best-effort: a failure for one attachment is reported on
    stderr and the remaining attachments are still processed.
    """
    import shutil  # local import: shutil is not among the file-level imports

    source_dir = os.path.join(page_dir, 'attachments')
    if not isdir(source_dir):
        return
    for attachment in listdir(source_dir):
        source = os.path.join(source_dir, attachment)
        target = os.path.join(attachment_dir, attachment.lower())
        try:
            shutil.copy(source, target)
        except OSError as err:
            print("Could not copy attachment %s: %s" % (source, err), file=sys.stderr)
def _indent_bullet(match):
    """Replace 1-4 leading whitespace characters before '*' by one tab less."""
    return '\t' * (len(match.group(1)) - 1) + '*'


def _link_camel_case(match):
    """Wrap a bare CamelCase word in [[...]]; return protected spans unchanged."""
    protected = match.group(1)
    if protected is not None:
        return protected
    return '[[' + match.group(2) + ']]'


def convert_page(page, file):
    """Convert the lines of a MoinMoin page to Zim markup.

    page -- list of text lines; it is modified in place and also returned.
    file -- list of path components of the page. All components but the
            last form the namespace used for attachment references, the
            last one is used as the page name for sub-page links.

    Every line is run through an ordered list of regular expression
    substitutions. Compared with the earlier revision of this function:

    * All patterns are raw strings, which avoids the invalid escape
      sequences Python 3 warns about. The CamelCase rule used '\\b' in a
      normal string, which is a backspace character, so it never matched.
      It now matches real word boundaries, but skips existing [[links]],
      {{embeds}}, ''verbatim'' spans, URLs, converted heading lines and
      words escaped with a leading '!'.
    * The sub-page link rule runs before the generic internal link rule,
      which previously consumed the '["' marker first. It is non-greedy so
      that two sub-page links on one line are converted separately.
    * Bullet indentation is handled by a single rule. The former four rules
      used \\s, which also matches the tabs they had just produced, so every
      nested bullet ended up at column 0.
    """
    namespace = ':' + ''.join(part + ':' for part in file[:-1])
    rules = (
        (r'\[\[TableOfContents.*\]\]', ''),                  # remove
        (r'\[\[BR\]\]$', ''),                                # line break at end of line - remove
        (r'\[\[BR\]\]', '\n'),                               # line break
        ('#pragma section-numbers off', ''),                 # remove
        (r'^##.*?\n', ''),                                   # remove
        ('``', ''),                                          # remove
        (r'\["/(.*?)"\]', '[[' + file[-1] + r':\1]]'),       # sub-page link (before generic link rules)
        (r'\["', '[['),                                      # internal link open
        (r'"\]', ']]'),                                      # internal link close
        (r'\[(http[^ ]*) ([^\]]*)\]', r'[[\1|\2]]'),         # web link
        (r'^(\s{1,4})\*', _indent_bullet),                   # bullets: n leading blanks -> n-1 tabs
        (r'^\s\s\s\s1\.', ' -'),
        (r'^\s\s1\.', ' -'),
        (r'^\s1\.', ' -'),
        # Headings are first written with '=-' pairs so that the shorter
        # heading patterns below cannot match an already converted line;
        # the '=-' -> '=' rule then produces the final markers.
        (r'^\s*=====\s*(.*)\s*=====\s*$', r'=-=- \1 =-=-'),                  # heading 5
        (r'^\s*====\s*(.*)\s*====\s*$', r'=-=-=- \1 =-=-=-'),                # heading 4
        (r'^\s*===\s*(.*)\s*===\s*$', r'=-=-=-=- \1 =-=-=-=-'),              # heading 3
        (r'^\s*==\s*(.*)\s*==\s*$', r'=-=-=-=-=- \1 =-=-=-=-=-'),            # heading 2
        (r'^\s*=\s*(.*)\s=\s*$', r'=-=-=-=-=-=- \1 =-=-=-=-=-=-'),           # heading 1
        ('=-', '='),
        (r'/!\\', '**(!)**'),                                # attention icon
        (r'\|{2}', '|'),                                     # table separator
        (r"'{5}([^']*)'{5}", r'**//\1//**'),                 # bold and italic
        (r"'{3}([^']*)'{3}", r'**\1**'),                     # bold
        (r"'{2}([^']*)'{2}", r'//\1//'),                     # italic
        (r'\{{3}', "''"),                                    # open code/verbatim line segment
        (r'\}{3}', "''"),                                    # close code/verbatim line segment
        (r'\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', r'\1'),  # Date value
        ('attachment:(.*)', '{{' + namespace + r'\1|}}'),
        # CamelCase words become links. This runs last so that the spans
        # produced above can be recognised and left alone (group 1).
        (r"(^={2,6}\s.*"            # converted heading line
         r"|\[\[.*?\]\]"            # existing link
         r"|\{\{.*?\}\}"            # embedded attachment
         r"|''.*?''"                # verbatim segment
         r"|\b\w+://\S+)"           # bare URL
         r"|(?<![\[!])\b([A-Z]+[a-z]+[A-Z][A-Za-z]*)\b",
         _link_camel_case),
    )
    # Compile once per call instead of looking every pattern up per line.
    compiled = [(re.compile(pattern), replacement) for pattern, replacement in rules]
    for index, line in enumerate(page):
        for pattern, replacement in compiled:
            line = pattern.sub(replacement, line)
        page[index] = line
    return page
def print_help():
    """Show the two-line usage text on stdout, then exit with status 0."""
    usage_lines = [
        "Usage: moin2zim.py <moinmoin pages directory> <output directory>",
        "Convert MoinMoin pages to ZIM Wiki.",
    ]
    print("\n".join(usage_lines))
    sys.exit(0)
def print_parameter_error():
    """Complain about the command-line parameters on stderr; exit with status 1."""
    complaint = "Incorrect parameters! Use --help switch to learn more."
    sys.stderr.write(complaint + "\n")
    sys.exit(1)
def fix_name(filename):
    """Translate MoinMoin's escaped page directory name into a Zim-friendly name.

    MoinMoin stores special characters of a page name as lower-case hex
    byte values in parentheses, e.g. '(20)' for a space or '(c3bc)' for the
    UTF-8 encoded u-umlaut. The known sequences are replaced by plain ASCII
    substitutes, and literal dots become underscores.

    '(2f)' (the '/' of sub-pages) is deliberately left alone; the caller
    splits the name on it to build the namespace directories. Sequences
    that are not listed below are left unchanged.

    The capital O-umlaut was previously looked up as '(00D6)', which is the
    Unicode code point rather than the UTF-8 byte sequence used by all the
    other entries, so it never matched. It is now '(c396)'.
    """
    replacements = (
        ('(2d)', '-'),          # hyphen
        ('(20)', '_'),          # space -> underscore
        ('(2e)', '_'),          # decimal point -> underscore
        ('(29)', '_'),          # ) -> underscore
        ('(28)', '_'),          # ( -> underscore
        ('(2b)', '+'),
        ('(2b2b)', '++'),
        ('(2b2b2d)', '++-'),
        ('.', '_'),             # decimal point -> underscore
        ('(2c20)', '_'),        # comma + space -> underscore
        ('(2028)', '_'),        # space + ( -> underscore
        ('(2920)', '_'),        # ) + space -> underscore
        ('(2220)', 'inch_'),    # " + space -> inch + underscore
        ('(3a20)', '_'),        # : + space -> underscore
        ('(202827)', '_'),      # space + ( + ' -> underscore
        ('(2720)', '_'),        # ' + space -> underscore
        ('(c39c)', 'Ue'),       # umlaut
        ('(c3bc)', 'ue'),       # umlaut
        ('(c384)', 'Ae'),       # umlaut
        ('(c3a4)', 'ae'),       # umlaut
        ('(c396)', 'Oe'),       # umlaut (UTF-8 bytes of U+00D6)
        ('(c3b6)', 'oe'),       # umlaut
    )
    for token, substitute in replacements:
        filename = filename.replace(token, substitute)
    return filename
#
# "main" starts here
#
def _ensure_dir(path):
    """Create the directory *path* unless it already exists (one level only)."""
    if not isdir(path):
        os.mkdir(path)


def _export_page(pathname, output_dir):
    """Convert one MoinMoin page directory and write the result below *output_dir*.

    Pages without a current revision file and MoinEditorBackup pages are
    skipped. Pages without a namespace are written to the 'unsorted'
    directory. Attachments are copied to a parallel tree below
    ``<output_dir>/media``.

    All paths are built with os.path.join, so *output_dir* works with or
    without a trailing slash (plain string concatenation used to require it).
    Files are read and written as UTF-8 instead of the platform default
    encoding (assumes the MoinMoin revision files are UTF-8 -- confirm for
    very old wikis).
    """
    curr_rev = get_current_revision(pathname)
    if not os.path.exists(curr_rev):
        return
    page_name = basename(pathname)
    if 'MoinEditorBackup' in page_name:  # don't convert backups
        return

    with open(curr_rev, 'r', encoding='utf-8') as source:
        content = source.readlines()

    page_name = fix_name(page_name)
    parts = page_name.split('(2f)')  # namespaces
    leaf = parts[-1]
    # Pages without a namespace are collected below 'unsorted'.
    parents = parts[:-1] if len(parts) > 1 else ['unsorted']

    page_dir = output_dir
    attachment_dir = os.path.join(output_dir, 'media')
    _ensure_dir(attachment_dir)
    for name in parents:
        page_dir = os.path.join(page_dir, name)
        _ensure_dir(page_dir)
        attachment_dir = os.path.join(attachment_dir, name)
        _ensure_dir(attachment_dir)

    content = convert_page(content, parents + [leaf])

    # open the file and add the ZIM header
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S.0", time.localtime())
    with open(os.path.join(page_dir, leaf + '.txt'), 'w', encoding='utf-8') as out:
        out.write('Content-Type: text/x-zim-wiki\nWiki-Format: zim 0.4\nCreation-Date: ')
        out.write(stamp)
        out.write('\n\n')
        # write the content of the page
        out.writelines([it.rstrip() + '\n' for it in content if it])

    # The trailing separator keeps this call valid for implementations of
    # copy_attachments that concatenate the directory and the file name.
    copy_attachments(pathname, os.path.join(attachment_dir, ''))


if len(sys.argv) > 1 and sys.argv[1] in ('-h', '--help'):
    print_help()  # exits
if len(sys.argv) < 3:
    print_parameter_error()  # exits
moin_pages_dir = sys.argv[1]
output_dir = sys.argv[2]
check_dirs(moin_pages_dir, output_dir)
print('Input dir is: %s.' % moin_pages_dir)
print('Output dir is: %s.' % output_dir)
pathnames = get_path_names(moin_pages_dir)
for pathname in pathnames:
    _export_page(pathname, output_dir)