summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSean Whitton <spwhitton@spwhitton.name>2015-11-18 10:57:49 -0700
committerSean Whitton <spwhitton@spwhitton.name>2015-11-18 10:57:49 -0700
commit96135c8fc043319b09d328ee432aafaa414a0bfd (patch)
tree771787da09eccbb864be123f49ed6332435f0d7d
parentdff3b59fe11bd00da9ecbc7dbfe804a61b0f100b (diff)
downloaddotfiles-96135c8fc043319b09d328ee432aafaa414a0bfd.tar.gz
Squashed commit of the following:
commit 420f675683aeb4b8573fee105534e1443dbca2e3 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:44 2015 -0700 fix image link regexp commit f6aaf71d419653baf0668ccfcecfda3aeadcdc5c Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:38 2015 -0700 strip Unicode from post titles, too commit 971309375e1c70ae94f7a6c7423a003f39b8d075 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:34 2015 -0700 comment commit b0e23e35c50479ad6b9e36133994da0f162f46d4 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 23:06:02 2015 -0700 fix some linebreaks commit 9ccbac9b7775cb5251a3b6ee5162ed576fbe33a7 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 23:00:07 2015 -0700 convert comments html -> mdwn commit f13e550a68448518ecbaf7509443801c49ae1515 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:54:24 2015 -0700 fix comment dirs mode commit 12c931d0b37ff5c995e35ae9ebc9e58668884d88 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:51:09 2015 -0700 don't copy my manually-generated thumbnails commit d09c52166286fea919b27489932adcc8257026ce Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:19 2015 -0700 avoid copytree being called more than once Kills program because copytree will not proceed if destination exists. 
commit 0717764894d2365270f5a6b4eae8d059d6038b9d Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:11 2015 -0700 another attempt to filter out .git commit eec0fa6be821c851a19c0cad9ee5fde7c2bd9b6a Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:06 2015 -0700 comment commit 36258e00f476d1172ebe900f4d7243d43948dd13 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:43:58 2015 -0700 MORE -> BREAK commit 8ba49ce02e28ed483b1e8f87a22dcdb505277128 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:23:53 2015 -0700 rm another debug print commit 58b23ee2972130c3065df5e49fd3b650e2f8d278 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:23:38 2015 -0700 Revert "for debugging purposes: process max. 10" This reverts commit ec3d6fa39c75418c9d1a26f3f337d1ec6082404f. Conflicts: bin/orgblosxom2ikiwiki.py commit 72354c2565cdf81cdf9544b9ede8b36819b1fafc Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:22:26 2015 -0700 assorted bug fixes in response to first run commit ec3d6fa39c75418c9d1a26f3f337d1ec6082404f Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 21:52:34 2015 -0700 for debugging purposes: process max. 10 commit 0a5e9eaa82e0f56d31c2bfa6fe76f8f5787039be Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 21:52:21 2015 -0700 actually finished (haha!): fix image embeds commit bbc5401404980018fdf4b43ac9374dc17e358a92 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:37:52 2015 -0700 comment commit 493dca1b1b1e782ac33ddc1d19e249b218c36ddc Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:37:04 2015 -0700 got to fix_images() too commit 3cbc7e4a9abf115a6666d7880c35e8a5843e2bb0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:32:21 2015 -0700 finish implementation! 
ha commit 7b6d54c50af9971c17b764a5715938a83d08fe18 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:25:33 2015 -0700 slug & reference comment commit 5f8b293edb766142457633fe5dcb3e3235e8c5a0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:14:57 2015 -0700 fix extension joining (may be unnecessary) commit 36253c644f1d14c27bc3f807f89dc4b16c023d45 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:04:48 2015 -0700 convert_comment() skeleton commit 1c23e83e81b7cc7cbf434332742f71b200cbafa1 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 21:49:22 2015 -0700 convert_post written commit 8ba3295d5b2a15c9c7f5b9d36ef1ef9a3473db82 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 19:39:08 2015 -0700 orgblosxom2ikiwiki.py main() fn written commit 7634d64890ab73e089c7174e4401a5195d690a63 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:28:49 2015 -0700 fix parseMeta commit f7fd8b331fce6959d699e2dd5b1c3959f7d0a8da Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:23:04 2015 -0700 record duplicates and invert code commenting-out commit 611f2e83b9ae27ae78e49e3768871f6d959f1561 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:18:54 2015 -0700 finally: code to find duplicates commit 7cdf0cb0e0c1a747bffa989c1925463a5c749d4f Author: Sean Whitton <spwhitton@spwhitton.name> Date: Sun Nov 15 22:15:56 2015 -0700 (broken) remNonDupes commit c8cd1998725d3092db2b6d28d8ac0b5069729dc0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Sun Nov 15 18:55:38 2015 -0700 checkForDuplicates hacked together commit e10b39d6189650a033bf41c93cf8af9d05409c3d Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 19:38:39 2015 -0700 parseCat, parseMeta and start work on main commit 1da32fe285523ea2a29286ba9181ff7b0f4a6d5a Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 19:04:32 2015 -0700 notes commit 
ef8b8939208ce7473850f9063d343901d4c672d5 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 08:36:01 2015 -0700 note commit 016ce14f64fd544e9088ebebcaf0c9094e8b2500 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 10 22:23:25 2015 -0700 start working on orgblosxom2ikiwiki
-rwxr-xr-xbin/orgblosxom2ikiwiki.py190
-rw-r--r--src/hscripts/hscripts.cabal10
-rw-r--r--src/hscripts/src/orgblosxom2ikiwiki.hs94
3 files changed, 294 insertions, 0 deletions
diff --git a/bin/orgblosxom2ikiwiki.py b/bin/orgblosxom2ikiwiki.py
new file mode 100755
index 00000000..6d177e1d
--- /dev/null
+++ b/bin/orgblosxom2ikiwiki.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import os
+import random
+import re
+import shutil
+import string
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from PIL import Image # apt-get install python-imaging
+
+# to run: cd ~/src/wiki && rm -rf blog/entry && g co blog/entry && cd
+# ~/lib/wikiannex && git clean -f && rm -rf
+# blog/img/{jhcoip,oldtech,oliscrot} && cd $HOME &&
+# orgblosxom2ikiwiki.py && rm
+# ~/lib/wikiannex/blog/img/{jhcoip,oldtech,oliscrot}/*thumb*
+
+# everything in Unicode please
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+# input
+POSTS = "/home/swhitton/local/big/blog"
+COMMENTS = "/home/swhitton/local/big/comments"
+# output
+ENTRIES = "/home/swhitton/src/wiki/blog/entry"
+IMAGES = "/home/swhitton/lib/wikiannex/blog/img"
+
+# Return `text` with typographic punctuation replaced by plain ASCII:
+# curly double/single quotes become straight quotes, em and en dashes
+# become "---" and "--", and the ellipsis character becomes "...".
+def strip_smarts(text):
+ return text.replace(u"“", "\"").replace(u"”", "\"").replace(u"’", "\'").replace(u"‘", "\'").replace(u"—", "---").replace(u"–", "--").replace(u"…", "...")
+
+# Rewrite markdown "thumbnail linked to full image" lines that point at
+# http://spw.sdf.org/blog/ into ikiwiki [[!img ...]] directives.
+# Lines that do not match are passed through unchanged. Returns the
+# lines re-joined with "\n".
+def fix_images(text):
+ fixed = []
+
+ for line in text.splitlines():
+ # group 1 is the thumbnail path, group 2 the full-size image path,
+ # both relative to the blog root. re.match only matches at the start
+ # of the line.
+ # NOTE(review): the dots in the host name are unescaped and both
+ # groups are greedy -- confirm no line carries two such links.
+ match = re.match(r'\[!\[\]\(http://spw.sdf.org/blog/(.*)\)\]\(http://spw.sdf.org/blog/(.*)\)', line)
+ if match:
+ thumb = match.group(1)
+ image = match.group(2)
+ # look at what else lives next to the image under POSTS
+ contents = os.listdir(os.path.join(POSTS, os.path.dirname(image)))
+ exts = map(lambda x: os.path.splitext(x)[1], contents)
+ if ".org" not in exts: # dedicated image dir
+ # keep the last directory component in the link path
+ # NOTE(review): rsplit(...)[1] raises IndexError when the
+ # directory part contains no "/" -- presumably never the case
+ # for the existing posts; confirm.
+ link_path = os.path.join(os.path.dirname(image).rsplit("/", 1)[1], os.path.basename(image))
+ else:
+ link_path = os.path.basename(image)
+
+ # the directive's size= is taken from the thumbnail's pixel size
+ im = Image.open(os.path.join(POSTS, thumb))
+ im_width, im_height = im.size
+ dimensions = str(im_width) + "x" + str(im_height)
+ fixed.append("[[!img blog/img/" + link_path + " size=" + dimensions + "]]")
+ else:
+ fixed.append(line)
+
+ return "\n".join(fixed)
+
+# Convert a "BREAK" marker into an ikiwiki [[!more ...]] directive
+# wrapping everything after the marker. Two layouts are handled: BREAK
+# on a line of its own, and BREAK inline surrounded by spaces. Text with
+# neither form is returned unchanged.
+def fix_more(text):
+ # split at the first occurrence of "BREAK" and strip whitespace from
+ # each of the three parts
+ # NOTE(review): partition splits at the first "BREAK" substring, which
+ # need not be the delimited occurrence tested for below -- confirm no
+ # post contains an earlier "BREAK".
+ before, more, after = map(lambda s: s.strip(), text.partition("BREAK"))
+ if "\nBREAK\n" in text:
+ return "\n".join([before + "\n", "[[!more linktext=\"continue reading this entry\" pages=\"!blog/entry/*\" text=\"\"\"", after, "\"\"\"]]"])
+ elif " BREAK " in text:
+ return before + " [[!more linktext=\"continue reading this entry\" pages=\"!blog/entry/*\" text=\"\"\"" + after + "\n\"\"\"]]"
+ else:
+ return text
+
+# Convert the org-mode post at path `post` into ikiwiki markdown and
+# return it as a string; the caller decides where to save it.
+# The result is: meta date, meta title, tag directive, a blank line,
+# then the pandoc-converted body.
+def convert_post(post):
+ with open(post, 'r') as h:
+ org = h.read()
+
+ # first line of the file: "#+HTML: <title>"
+ title = org.splitlines()[0].replace('#+HTML: ', '')
+ title = strip_smarts(title)
+ title = "[[!meta title=\"" + title + "\"]]"
+
+ # second line of the file: "#+HTML: #published <date>"
+ date = org.splitlines()[1].replace('#+HTML: #published ', '')
+ date = "[[!meta date=\"" + date + "\"]]"
+
+ # the post's directory relative to POSTS becomes its tags, one tag
+ # per path component
+ tags = os.path.dirname(post).replace(POSTS, "")[1:].replace("/", " ")
+ tags = "[[!tag imported_PyBlosxom " + tags + "]]"
+
+ # this file generates a pandoc error:
+ # /home/swhitton/local/big/blog/linkdump/novdec14.org
+ # The whole file, header lines included, is fed to pandoc on stdin.
+ # stderr is not piped, so `error` below is always None and pandoc's
+ # exit status is not checked.
+ pandoc = subprocess.Popen(["pandoc", "-f", "org", "-t", "markdown_strict"],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ body, error = pandoc.communicate(input=org)
+
+ body = strip_smarts(body)
+ body = fix_images(body)
+ body = fix_more(body)
+
+ return "\n".join([date, title, tags, "", body])
+
+# Convert one comment XML file (path `comment`) into an ikiwiki
+# "._comment" file and write it under ENTRIES/<slug>/, where <slug> is
+# the basename of the comment's <parent> element. Returns nothing.
+def convert_comment(comment):
+ # print "attempting to parse", comment
+ tree = ET.parse(comment)
+ root = tree.getroot()
+
+ # reference:
+
+ # [[!comment format=mdwn
+ # username="spwhitton@171b57686690088a367b4b10ddf73c4ca6f16601"
+ # nickname="spwhitton"
+ # avatar="http://cdn.libravatar.org/avatar/40da86a5d03e6fa62515a9d762601ed2"
+ # subject="And a second one, gravatar free"
+ # date="2015-11-11T00:18:46Z"
+ # content="""
+ # Here it is
+ # """]]
+
+ slug = os.path.basename(root.find('parent').text)
+ # username is the local part of the e-mail address when an <email>
+ # element exists, otherwise the author name
+ address = root.find('email')
+ if address == None:
+ username = root.find('author').text
+ else:
+ username = address.text.partition('@')[0]
+
+ # the comment body is HTML; convert it to markdown with pandoc
+ # (stderr is not piped, so `error` is always None)
+ desc = root.find('description').text
+ pandoc = subprocess.Popen(["pandoc", "-f", "html", "-t", "markdown_strict"],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ desc, error = pandoc.communicate(input=desc)
+
+ # from here on `comment` holds the output text, not the input path
+ comment = "\n".join([
+ "[[!comment format=mdwn",
+ " username=\"" + username + "\"",
+ " nickname=\"" + root.find('author').text + "\"",
+ " date=\"" + root.find('w3cdate').text + "\"",
+ " content=\"\"\"",
+ desc + "\"\"\"]]"
+ ])
+
+ # comments live in a directory named after the post they belong to
+ the_dir = os.path.join(ENTRIES, slug)
+ if not os.path.exists(the_dir):
+ os.mkdir(the_dir, 0755)
+ # random 16-character suffix for the comment's file name
+ rands = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16))
+ # ^ http://stackoverflow.com/a/2257449
+ if not comment.endswith("\n"):
+ comment = comment + "\n"
+ with open(os.path.join(ENTRIES, slug, "comment_" + rands + "._comment"), 'w') as h:
+ h.write(comment)
+
+# Walk POSTS converting every .org post into ENTRIES and copying
+# attachments into IMAGES, then walk COMMENTS converting every .cmt
+# file. Exits the program on the first output filename conflict.
+def main():
+ for root, dirs, files in os.walk(POSTS):
+
+ # skip all the templates stored in root of blog, and anything
+ # whose path starts with the blog's .git directory
+ if root == POSTS or root.startswith("/home/swhitton/local/big/blog/.git"):
+ continue
+
+ # 1. If there's no .org in this dir and we're at the bottom of
+ # a tree, then it's a dir for images only, so copy it
+ # verbatim. And we know from inspection with old Haskell
+ # script that there are no conflicts other than inside these
+ # image-only directories
+ exts = map(lambda x: os.path.splitext(x)[1], files)
+ if ".org" not in exts and not any(dirs):
+ dest = os.path.join(IMAGES, os.path.basename(root))
+ # copytree refuses to run when the destination already exists
+ if not os.path.exists(dest):
+ shutil.copytree(root, dest)
+ # 2. now convert posts and images in the usual way
+ # NOTE(review): indentation is flattened in this rendering; this
+ # `else:` is presumably paired with the outer ".org"/dirs test above,
+ # not with the os.path.exists check -- confirm against the real file.
+ else:
+ for f in files:
+ ext = os.path.splitext(f)[1]
+ if ext == ".org":
+ # convert_post, unlike convert_comment, relies on
+ # us to decide where to save it
+ post = convert_post(os.path.join(root, f))
+ fname = os.path.join(ENTRIES, os.path.splitext(f)[0] + ".mdwn")
+ if os.path.exists(fname): # safety if inspection wrong
+ print "uh oh! conflict! " + fname + " exists!"
+ sys.exit()
+ else:
+ if not post.endswith("\n"):
+ post = post + "\n"
+ with open(fname, 'w') as h:
+ h.write(post)
+ # any other file is copied flat into IMAGES, except those with
+ # "thumb." in their name
+ elif "thumb." not in f:
+ if os.path.exists(os.path.join(IMAGES, f)): # safety if inspection wrong
+ print "uh oh! conflict! " + os.path.join(IMAGES, f) + " exists!"
+ sys.exit()
+ else:
+ shutil.copy(os.path.join(root, f), IMAGES)
+
+ for root, dirs, files in os.walk(COMMENTS):
+ for f in files:
+ if os.path.splitext(f)[1] == ".cmt":
+ # convert_comment does the saving since the post to which
+ # the comment is associated is stored within the comment
+ convert_comment(os.path.join(root, f))
+
+if __name__ == "__main__":
+ main()
diff --git a/src/hscripts/hscripts.cabal b/src/hscripts/hscripts.cabal
index c73ca53a..aa047fee 100644
--- a/src/hscripts/hscripts.cabal
+++ b/src/hscripts/hscripts.cabal
@@ -18,3 +18,13 @@ executable video4ipad
, optparse-applicative
, process
default-language: Haskell2010
+
+executable orgblosxom2ikiwiki
+ hs-source-dirs: src
+ main-is: orgblosxom2ikiwiki.hs
+ build-depends: base
+ , MissingH
+ , filemanip
+ , directory
+ , filepath
+ default-language: Haskell2010
diff --git a/src/hscripts/src/orgblosxom2ikiwiki.hs b/src/hscripts/src/orgblosxom2ikiwiki.hs
new file mode 100644
index 00000000..81ee8701
--- /dev/null
+++ b/src/hscripts/src/orgblosxom2ikiwiki.hs
@@ -0,0 +1,94 @@
+-- 0. check for filename collisions and generate .htaccess entry
+-- 1. parse first two lines: produce date=, modified= and title= (second one just in case it can be made to take effect)
+-- 2. pass remainder through pandoc (with options to use old-style markdown: for example, need a blank line between a paragraph and a list)
+-- 3. fix MORE and links in bottom part, and add tags (and imported_pyblosxom tag)
+-- 4. put file into entries dir, and images into wikiannex
+
+-- There are no blog post filename conflicts. There are the following attachment conflicts:
+
+-- ["/home/swhitton/local/big/blog/korea/epik/classroom.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/classroom.jpg"
+-- ,"/home/swhitton/local/big/blog/korea/epik/classroomthumb.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/classroomthumb.jpg"
+-- ,"/home/swhitton/local/big/blog/tech/emacs/org-mode/freemindeg.html_files/icons/help.png"
+-- ,"/home/swhitton/local/big/blog/oxford/oliscrot/help.png"
+-- ,"/home/swhitton/local/big/blog/korea/epik/temple.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/temple.jpg"
+-- ,"/home/swhitton/local/big/blog/korea/epik/templethumb.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/templethumb.jpg"]
+
+import Data.String.Utils (replace)
+import System.FilePath.Find
+
+-- import System.Directory
+-- import System.FilePath
+
+-- import Control.Monad
+-- import Data.List
+
+-- | Render one blog entry as ikiwiki source.
+--
+-- The first two lines of @source@ are handed to 'parseMeta', @category@
+-- is handed to 'parseCat', and the remaining lines are handed to
+-- 'convertBody'. The three resulting line lists are concatenated in that
+-- order and joined with 'unlines'.
+convertEntry :: String -> String -> String
+convertEntry category source =
+  unlines (concat [metaLines, tagLines, bodyLines])
+  where
+    (header, rest) = splitAt 2 (lines source)
+    metaLines = parseMeta header
+    tagLines = parseCat category
+    bodyLines = convertBody (unlines rest)
+
+-- | Build the single-line @[[!tag ...]]@ directive for a category path.
+--
+-- Every @/@ in the category is replaced by a space, so
+-- @"/tech/gnu+linux"@ yields @"[[!tag tech gnu+linux ]]"@. The space
+-- after @tag@ comes from the category's leading slash.
+parseCat :: String -> [String]
+parseCat category = [directive]
+  where
+    spaced = replace "/" " " category
+    directive = "[[!tag" ++ spaced ++ " ]]"
+
+-- | Turn the two header lines of a post into ikiwiki @meta@ directives.
+--
+-- The first line is expected to be @#+HTML: \<title\>@ and the second
+-- @#+HTML: #published \<date\>@. Each fixed prefix is dropped by its own
+-- length, so no leading space is left in the title. The result is the
+-- title directive followed by the date directive.
+--
+-- Any input that is not exactly two lines is a malformed header and
+-- raises an error naming the number of lines received.
+parseMeta :: [String] -> [String]
+parseMeta [title, date] =
+  [ "[[!meta title=\"" ++ drop (length titlePrefix) title ++ "\"]]"
+  , "[[!meta date=\"" ++ drop (length datePrefix) date ++ "\"]]"
+  ]
+  where
+    titlePrefix = "#+HTML: "
+    datePrefix = "#+HTML: #published "
+parseMeta other =
+  error ("parseMeta: expected 2 header lines, got " ++ show (length other))
+
+-- | Convert the body of a post (everything after the two header lines)
+-- into output lines.
+--
+-- Not implemented yet: this placeholder binding exists so that the type
+-- signature has an accompanying definition and the module compiles.
+-- Evaluating the result raises an error.
+convertBody :: String -> [String]
+convertBody _ = error "convertBody: not implemented"
+
+-- | Entry point (work in progress).
+--
+-- Collects every @.org@ file under the blog root as an entry and every
+-- other file as an attachment, recursing into all directories. Nothing is
+-- done with the two lists yet; the action ends in 'undefined'.
+main :: IO ()
+main = do
+  let rootDir = "/home/swhitton/doc/www/blog"
+  -- 'always' is the recursion predicate: descend into every directory
+  entries <- find always (extension ==? ".org") rootDir
+  -- use /=? directly rather than negating ==? through <$>
+  attachments <- find always (extension /=? ".org") rootDir
+
+  undefined
+
+-- getDirectoryContentsRecursive :: FilePath -> IO [FilePath]
+-- getDirectoryContentsRecursive path = do
+-- names <- filter (`notElem` [".", "..", ".git"]) <$> getDirectoryContents path
+-- liftM concat $ forM names $ \name -> do
+-- dir <- doesDirectoryExist (path </> name)
+-- if dir
+-- then getDirectoryContentsRecursive (path </> name)
+-- else return [path </> name]
+
+-- checkForDuplicates :: IO ()
+-- checkForDuplicates = do
+-- names <- (++)
+-- <$> getDirectoryContentsRecursive "/home/swhitton/local/big/blog"
+-- <*> getDirectoryContentsRecursive "/home/swhitton/src/wiki/blog/entry"
+-- let posts = takeBaseName <$> filter ((`elem` [".org", ".mdwn"]) . takeExtension) names
+-- let other = filter ((`notElem` [".org", ".mdwn"]) . takeExtension) names
+-- let other' = takeBaseName <$> other
+-- -- let otherDupes = remNonDupes . sortOn takeBaseName $ other
+-- let otherDupes = sortOn takeBaseName [ x | x <- other, takeBaseName x `elem` (takeBaseName <$> (other `listMinus` x))]
+-- if nub posts == posts
+-- then putStrLn "no post filenames conflict" -- this gets printed
+-- else putStrLn "oh no! post filenames conflict!"
+-- if nub other' == other'
+-- then putStrLn "no attachment filenames conflict"
+-- else putStrLn "oh no! attachment filenames conflict!" -- this gets printed!
+-- putStrLn . show $ otherDupes
+
+-- remNonDupes :: (Eq a) => [a] -> [a]
+-- remNonDupes = remNonDupes' []
+
+-- remNonDupes' :: (Eq a) => [a] -> [a] -> [a]
+-- remNonDupes' ys [] = ys
+-- remNonDupes' ys (x:xs) = if x `elem` xs || x `elem` ys
+-- then remNonDupes' (ys ++ [x]) xs
+-- else remNonDupes' ys xs
+
+-- main :: IO ()
+-- main = checkForDuplicates
+
+-- listMinus :: (Eq a) => [a] -> a -> [a]
+-- listMinus (x:xs) y = if x == y then xs else x:listMinus xs y