#!/usr/bin/env python
# coding=utf-8
"""Convert an org-mode PyBlosxom blog to ikiwiki pages.

Posts (``*.org`` under POSTS) are run through pandoc and written to ENTRIES
as ``*.mdwn`` files, images are copied to IMAGES, and comments (``*.cmt``
XML files under COMMENTS) are written next to the entry they belong to as
ikiwiki ``._comment`` files.

The script was written for Python 2; the syntax is kept valid for Python 3
as well so that the text helpers can be imported on their own.
"""

import os
import random
import re
import shutil
import string
import subprocess
import sys
import xml.etree.ElementTree as ET

try:
    from PIL import Image  # apt-get install python-imaging
except ImportError:
    # Only fix_images() needs PIL.  main() refuses to start without it, so
    # the script still fails before anything is written.
    Image = None

# to run: cd ~/src/wiki && rm -rf blog/entry && g co blog/entry && cd
# ~/lib/wikiannex && git clean -f && rm -rf
# blog/img/{jhcoip,oldtech,oliscrot} && cd $HOME &&
# orgblosxom2ikiwiki.py && rm
# ~/lib/wikiannex/blog/img/{jhcoip,oldtech,oliscrot}/*thumb*

# everything in Unicode please
# (Python 2 only: lets byte strings and unicode strings mix implicitly as
# UTF-8.  reload()/setdefaultencoding() do not exist on Python 3.)
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

# input
POSTS = "/home/swhitton/local/big/blog"
COMMENTS = "/home/swhitton/local/big/comments"

# output
ENTRIES = "/home/swhitton/src/wiki/blog/entry"
IMAGES = "/home/swhitton/lib/wikiannex/blog/img"

# Typographic characters and their plain-ASCII replacements, applied in
# this order by strip_smarts().
_SMART_PUNCTUATION = (
    (u"“", '"'),
    (u"”", '"'),
    (u"’", "'"),
    (u"‘", "'"),
    (u"—", "---"),
    (u"–", "--"),
    (u"…", "..."),
)

# A Markdown thumbnail that links to the full image, both hosted under the
# old blog URL.  Group 1 is the thumbnail path, group 2 the image path,
# each relative to the blog root.
_IMAGE_LINK_RE = re.compile(
    r'\[!\[\]\(http://spw\.sdf\.org/blog/(.*)\)\]'
    r'\(http://spw\.sdf\.org/blog/(.*)\)')

# Opening and closing halves of the ikiwiki "more" directive.
_MORE_OPEN = ('[[!more linktext="continue reading this entry" '
              'pages="!blog/entry/*" text="""')
_MORE_CLOSE = '"""]]'


def strip_smarts(text):
    """Return *text* with smart quotes, dashes and ellipses made ASCII."""
    for fancy, plain in _SMART_PUNCTUATION:
        text = text.replace(fancy, plain)
    return text


def fix_images(text):
    """Rewrite old-blog thumbnail links in *text* as ikiwiki img directives.

    Each line that starts with a linked thumbnail is replaced by
    ``[[!img blog/img/<path> size=<W>x<H>]]``, where the size is read from
    the thumbnail file under POSTS.  Other lines are kept as they are.
    Lines are re-joined with "\\n", so a trailing newline is not preserved.

    Raises ImportError if a thumbnail link is found and PIL is missing.
    """
    fixed = []
    for line in text.splitlines():
        match = _IMAGE_LINK_RE.match(line)
        if not match:
            fixed.append(line)
            continue

        if Image is None:
            raise ImportError("PIL is required to read thumbnail sizes")

        thumb, image = match.groups()
        image_dir = os.path.dirname(image)
        siblings = os.listdir(os.path.join(POSTS, image_dir))
        has_post = any(os.path.splitext(name)[1] == ".org"
                       for name in siblings)
        if has_post:
            # image lives next to posts: main() copies it straight into
            # IMAGES
            link_path = os.path.basename(image)
        else:
            # dedicated image dir: main() copies the whole directory to
            # IMAGES/<basename of the directory>, so use the same name here
            link_path = os.path.join(os.path.basename(image_dir),
                                     os.path.basename(image))

        width, height = Image.open(os.path.join(POSTS, thumb)).size
        fixed.append("[[!img blog/img/%s size=%dx%d]]"
                     % (link_path, width, height))
    return "\n".join(fixed)


def fix_more(text):
    """Replace a BREAK marker in *text* with an ikiwiki "more" directive.

    Two forms are recognised: ``BREAK`` on a line of its own, and ``BREAK``
    inline with a space on either side.  Everything after the marker is
    moved into the directive's text.  Without either form *text* is
    returned unchanged.
    """
    # Split on the marker that was actually matched, not on the first bare
    # "BREAK" substring, so a word such as "BREAKFAST" earlier in the text
    # cannot become the split point.
    if "\nBREAK\n" in text:
        before, _, after = text.partition("\nBREAK\n")
        return "\n".join([before.strip() + "\n",
                          _MORE_OPEN,
                          after.strip(),
                          _MORE_CLOSE])
    if " BREAK " in text:
        before, _, after = text.partition(" BREAK ")
        return (before.strip() + " " + _MORE_OPEN
                + after.strip() + "\n" + _MORE_CLOSE)
    return text


def _pandoc(text, source_format, label):
    """Convert *text* from *source_format* to strict Markdown via pandoc.

    *label* names the input in the warning printed when pandoc exits with
    a non-zero status.  Whatever pandoc wrote to stdout is still returned
    in that case, so one bad input does not abort the whole run.
    """
    # Encode/decode explicitly at the pipe.  NOTE(review): this assumes
    # pandoc reads and writes UTF-8 -- confirm for the installed version.
    if text is None:
        text = b""
    elif not isinstance(text, bytes):
        text = text.encode("utf-8")
    proc = subprocess.Popen(
        ["pandoc", "-f", source_format, "-t", "markdown_strict"],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    output, _ = proc.communicate(input=text)
    if proc.returncode != 0:
        sys.stderr.write("warning: pandoc exited with status %d for %s\n"
                         % (proc.returncode, label))
    return output.decode("utf-8")


def convert_post(post):
    """Return the ikiwiki source for the org-mode post at path *post*.

    The first line of the file supplies the title and the second the
    publication date (both as ``#+HTML:`` lines); the directories between
    POSTS and the file become tags.  The caller decides where to save the
    result.

    Raises ValueError if the file has fewer than two lines.
    """
    with open(post, 'r') as h:
        org = h.read()

    header = org.splitlines()
    if len(header) < 2:
        raise ValueError("no title/date header lines in " + post)

    title = strip_smarts(header[0].replace('#+HTML: ', ''))
    date = header[1].replace('#+HTML: #published ', '')
    tags = os.path.dirname(post).replace(POSTS, "")[1:].replace("/", " ")

    # this file generates a pandoc error:
    # /home/swhitton/local/big/blog/linkdump/novdec14.org
    body = _pandoc(org, "org", post)
    body = fix_more(fix_images(strip_smarts(body)))

    return "\n".join([
        '[[!meta date="' + date + '"]]',
        '[[!meta title="' + title + '"]]',
        "[[!tag imported_PyBlosxom " + tags + "]]",
        "",
        body,
    ])


def convert_comment(comment):
    """Convert the XML comment file at path *comment* and save the result.

    The file is written to ``ENTRIES/<slug>/comment_<random>._comment``,
    where the slug is the basename of the comment's ``parent`` element;
    the directory is created if needed.
    """
    # print "attempting to parse", comment
    root = ET.parse(comment).getroot()

    # reference:

    # [[!comment format=mdwn
    #  username="spwhitton@171b57686690088a367b4b10ddf73c4ca6f16601"
    #  nickname="spwhitton"
    #  avatar="http://cdn.libravatar.org/avatar/40da86a5d03e6fa62515a9d762601ed2"
    #  subject="And a second one, gravatar free"
    #  date="2015-11-11T00:18:46Z"
    #  content="""
    # Here it is
    # """]]

    slug = os.path.basename(root.find('parent').text)
    author = root.find('author').text

    # Compare with None explicitly: an Element without children is falsy.
    # An <email/> element with no text falls back to the author name too.
    address = root.find('email')
    if address is None or not address.text:
        username = author
    else:
        username = address.text.partition('@')[0]

    desc = _pandoc(root.find('description').text, "html", comment)

    rendered = "\n".join([
        "[[!comment format=mdwn",
        ' username="' + username + '"',
        ' nickname="' + author + '"',
        ' date="' + root.find('w3cdate').text + '"',
        ' content="""',
        desc + '"""]]',
    ])
    if not rendered.endswith("\n"):
        rendered += "\n"

    the_dir = os.path.join(ENTRIES, slug)
    if not os.path.exists(the_dir):
        os.mkdir(the_dir, 0o755)

    # random suffix so several comments on one entry get distinct files
    # ^ http://stackoverflow.com/a/2257449
    rands = ''.join(random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(16))
    with open(os.path.join(the_dir, "comment_" + rands + "._comment"),
              'w') as h:
        h.write(rendered)


def _abort_on_conflict(path):
    """Exit with an error if *path* already exists (safety net)."""
    if os.path.exists(path):
        # a string argument makes sys.exit print to stderr and exit with 1
        sys.exit("uh oh! conflict! " + path + " exists!")


def main():
    """Convert every post, image and comment; stop on the first conflict."""
    if Image is None:
        sys.exit("PIL (python-imaging) is required to read thumbnail sizes")

    git_dir = os.path.join(POSTS, ".git")
    for root, dirs, files in os.walk(POSTS):
        # skip all the templates stored in root of blog
        if root == POSTS or root.startswith(git_dir):
            continue

        # 1. If there's no .org in this dir and we're at the bottom of
        # a tree, then it's a dir for images only, so copy it
        # verbatim.  And we know from inspection with old Haskell
        # script that there are no conflicts other than inside these
        # image-only directories
        has_post = any(os.path.splitext(f)[1] == ".org" for f in files)
        if not has_post and not dirs:
            dest = os.path.join(IMAGES, os.path.basename(root))
            if not os.path.exists(dest):
                shutil.copytree(root, dest)
            continue

        # 2. now convert posts and images in the usual way
        for f in files:
            stem, ext = os.path.splitext(f)
            if ext == ".org":
                # convert_post, unlike convert_comment, relies on
                # us to decide where to save it
                post = convert_post(os.path.join(root, f))
                fname = os.path.join(ENTRIES, stem + ".mdwn")
                _abort_on_conflict(fname)  # safety if inspection wrong
                if not post.endswith("\n"):
                    post += "\n"
                with open(fname, 'w') as h:
                    h.write(post)
            elif "thumb." not in f:
                # safety if inspection wrong
                _abort_on_conflict(os.path.join(IMAGES, f))
                shutil.copy(os.path.join(root, f), IMAGES)

    for root, dirs, files in os.walk(COMMENTS):
        for f in files:
            if os.path.splitext(f)[1] == ".cmt":
                # convert_comment does the saving since the post to which
                # the comment is associated is stored within the comment
                convert_comment(os.path.join(root, f))


if __name__ == "__main__":
    main()