Squashed commit of the following:

commit 420f675683aeb4b8573fee105534e1443dbca2e3 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:44 2015 -0700 fix image link regexp commit f6aaf71d419653baf0668ccfcecfda3aeadcdc5c Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:38 2015 -0700 strip Unicode from post titles, too commit 971309375e1c70ae94f7a6c7423a003f39b8d075 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 18 07:55:34 2015 -0700 comment commit b0e23e35c50479ad6b9e36133994da0f162f46d4 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 23:06:02 2015 -0700 fix some linebreaks commit 9ccbac9b7775cb5251a3b6ee5162ed576fbe33a7 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 23:00:07 2015 -0700 convert comments html -> mdwn commit f13e550a68448518ecbaf7509443801c49ae1515 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:54:24 2015 -0700 fix comment dirs mode commit 12c931d0b37ff5c995e35ae9ebc9e58668884d88 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:51:09 2015 -0700 don't copy my manually-generated thumbnails commit d09c52166286fea919b27489932adcc8257026ce Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:19 2015 -0700 avoid copytree being called more than once Kills program because copytree will not proceed if destination exists. 
commit 0717764894d2365270f5a6b4eae8d059d6038b9d Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:11 2015 -0700 another attempt to filter out .git commit eec0fa6be821c851a19c0cad9ee5fde7c2bd9b6a Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:44:06 2015 -0700 comment commit 36258e00f476d1172ebe900f4d7243d43948dd13 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:43:58 2015 -0700 MORE -> BREAK commit 8ba49ce02e28ed483b1e8f87a22dcdb505277128 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:23:53 2015 -0700 rm another debug print commit 58b23ee2972130c3065df5e49fd3b650e2f8d278 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:23:38 2015 -0700 Revert "for debugging purposes: process max. 10" This reverts commit ec3d6fa39c75418c9d1a26f3f337d1ec6082404f. Conflicts: bin/orgblosxom2ikiwiki.py commit 72354c2565cdf81cdf9544b9ede8b36819b1fafc Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 22:22:26 2015 -0700 assorted bug fixes in response to first run commit ec3d6fa39c75418c9d1a26f3f337d1ec6082404f Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 21:52:34 2015 -0700 for debugging purposes: process max. 10 commit 0a5e9eaa82e0f56d31c2bfa6fe76f8f5787039be Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 17 21:52:21 2015 -0700 actually finished (haha!): fix image embeds commit bbc5401404980018fdf4b43ac9374dc17e358a92 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:37:52 2015 -0700 comment commit 493dca1b1b1e782ac33ddc1d19e249b218c36ddc Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:37:04 2015 -0700 got to fix_images() too commit 3cbc7e4a9abf115a6666d7880c35e8a5843e2bb0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:32:21 2015 -0700 finish implementation! 
ha commit 7b6d54c50af9971c17b764a5715938a83d08fe18 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:25:33 2015 -0700 slug & reference comment commit 5f8b293edb766142457633fe5dcb3e3235e8c5a0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:14:57 2015 -0700 fix extension joining (may be unnecessary) commit 36253c644f1d14c27bc3f807f89dc4b16c023d45 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 22:04:48 2015 -0700 convert_comment() skeleton commit 1c23e83e81b7cc7cbf434332742f71b200cbafa1 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 21:49:22 2015 -0700 convert_post written commit 8ba3295d5b2a15c9c7f5b9d36ef1ef9a3473db82 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 19:39:08 2015 -0700 orgblosxom2ikiwiki.py main() fn written commit 7634d64890ab73e089c7174e4401a5195d690a63 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:28:49 2015 -0700 fix parseMeta commit f7fd8b331fce6959d699e2dd5b1c3959f7d0a8da Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:23:04 2015 -0700 record duplicates and invert code commenting-out commit 611f2e83b9ae27ae78e49e3768871f6d959f1561 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Mon Nov 16 13:18:54 2015 -0700 finally: code to find duplicates commit 7cdf0cb0e0c1a747bffa989c1925463a5c749d4f Author: Sean Whitton <spwhitton@spwhitton.name> Date: Sun Nov 15 22:15:56 2015 -0700 (broken) remNonDupes commit c8cd1998725d3092db2b6d28d8ac0b5069729dc0 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Sun Nov 15 18:55:38 2015 -0700 checkForDuplicates hacked together commit e10b39d6189650a033bf41c93cf8af9d05409c3d Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 19:38:39 2015 -0700 parseCat, parseMeta and start work on main commit 1da32fe285523ea2a29286ba9181ff7b0f4a6d5a Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 19:04:32 2015 -0700 notes commit 
ef8b8939208ce7473850f9063d343901d4c672d5 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Wed Nov 11 08:36:01 2015 -0700 note commit 016ce14f64fd544e9088ebebcaf0c9094e8b2500 Author: Sean Whitton <spwhitton@spwhitton.name> Date: Tue Nov 10 22:23:25 2015 -0700 start working on orgblosxom2ikiwiki
author: Sean Whitton <spwhitton@spwhitton.name> 2015-11-18 10:57:49 -0700
committer: Sean Whitton <spwhitton@spwhitton.name> 2015-11-18 10:57:49 -0700
commit: 96135c8fc043319b09d328ee432aafaa414a0bfd (patch)
tree: 771787da09eccbb864be123f49ed6332435f0d7d
parent: dff3b59fe11bd00da9ecbc7dbfe804a61b0f100b (diff)
download: dotfiles-96135c8fc043319b09d328ee432aafaa414a0bfd.tar.gz
3 files changed, 294 insertions, 0 deletions
diff --git a/bin/orgblosxom2ikiwiki.py b/bin/orgblosxom2ikiwiki.py
new file mode 100755
index 00000000..6d177e1d
--- /dev/null
+++ b/bin/orgblosxom2ikiwiki.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python
+# coding=utf-8
+
+import os
+import random
+import re
+import shutil
+import string
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from PIL import Image           # apt-get install python-imaging
+
+# to run: cd ~/src/wiki && rm -rf blog/entry && g co blog/entry && cd
+# ~/lib/wikiannex && git clean -f && rm -rf
+# blog/img/{jhcoip,oldtech,oliscrot} && cd $HOME &&
+# orgblosxom2ikiwiki.py && rm
+# ~/lib/wikiannex/blog/img/{jhcoip,oldtech,oliscrot}/*thumb*
+
+# everything in Unicode please
+# (Python 2 only: reload(sys) is what makes setdefaultencoding callable
+# here; the effect is that implicit str <-> unicode conversions, such as
+# the ones in strip_smarts below, use UTF-8)
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+# input: POSTS is the tree walked for .org posts and their attachments,
+# COMMENTS is the tree walked for .cmt comment files
+POSTS = "/home/swhitton/local/big/blog"
+COMMENTS = "/home/swhitton/local/big/comments"
+# output: ENTRIES receives the converted .mdwn posts and one
+# subdirectory of comments per post, IMAGES receives the attachments
+ENTRIES = "/home/swhitton/src/wiki/blog/entry"
+IMAGES = "/home/swhitton/lib/wikiannex/blog/img"
+
+def strip_smarts(text):
+    """Replace typographic ("smart") punctuation in text with ASCII.
+
+    Curly double and single quotes become straight quotes, an em dash
+    becomes "---", an en dash becomes "--" and the ellipsis character
+    becomes "...".  Returns the converted text.
+    """
+    substitutions = (
+        (u"“", "\""),
+        (u"”", "\""),
+        (u"’", "\'"),
+        (u"‘", "\'"),
+        (u"—", "---"),
+        (u"–", "--"),
+        (u"…", "..."),
+    )
+    for fancy, plain in substitutions:
+        text = text.replace(fancy, plain)
+    return text
+
+def fix_images(text):
+    """Rewrite linked-thumbnail image embeds as ikiwiki img directives.
+
+    Each line of text that starts with a Markdown thumbnail-inside-a-link
+    pointing at http://spw.sdf.org/blog/ is replaced by
+
+        [[!img blog/img/<path> size=<width>x<height>]]
+
+    where the size is read from the thumbnail file under POSTS.  All
+    other lines are kept as they are.  Returns the lines re-joined with
+    "\n".
+    """
+    # dots escaped so that only the literal host name matches
+    embed = re.compile(r'\[!\[\]\(http://spw\.sdf\.org/blog/(.*)\)\]'
+                       r'\(http://spw\.sdf\.org/blog/(.*)\)')
+    fixed = []
+
+    for line in text.splitlines():
+        match = embed.match(line)
+        if not match:
+            fixed.append(line)
+            continue
+
+        thumb, image = match.groups()
+        image_dir = os.path.dirname(image)
+        contents = os.listdir(os.path.join(POSTS, image_dir))
+        has_post = any(os.path.splitext(name)[1] == ".org" for name in contents)
+        if has_post:
+            # image sits next to posts; main() copies it flat into IMAGES
+            link_path = os.path.basename(image)
+        else:
+            # dedicated image dir: main() copies it to IMAGES under the
+            # directory's own basename.  os.path.basename also copes with
+            # a single-component directory, where rsplit("/", 1)[1]
+            # raised IndexError.
+            link_path = os.path.join(os.path.basename(image_dir),
+                                     os.path.basename(image))
+
+        # the directive is given the thumbnail's dimensions, not the
+        # full-size image's
+        im_width, im_height = Image.open(os.path.join(POSTS, thumb)).size
+        dimensions = str(im_width) + "x" + str(im_height)
+        fixed.append("[[!img blog/img/" + link_path + " size=" + dimensions + "]]")
+
+    return "\n".join(fixed)
+
+def fix_more(text):
+    """Turn a BREAK marker into an ikiwiki more directive.
+
+    If text contains BREAK on a line of its own, everything after that
+    line is wrapped in a [[!more ...]] directive placed in its own
+    paragraph.  Otherwise, if text contains " BREAK " inline, the rest of
+    the text is wrapped in a directive that continues the same
+    paragraph.  Text without either marker is returned unchanged.
+    """
+    more_open = "[[!more linktext=\"continue reading this entry\" pages=\"!blog/entry/*\" text=\"\"\""
+    more_close = "\"\"\"]]"
+
+    # Split on the marker that was actually detected.  Splitting on the
+    # bare substring "BREAK" would cut at an earlier word that merely
+    # contains it (e.g. "BREAKFAST").
+    if "\nBREAK\n" in text:
+        before, _, after = text.partition("\nBREAK\n")
+        return "\n".join([before.strip() + "\n", more_open, after.strip(), more_close])
+    elif " BREAK " in text:
+        before, _, after = text.partition(" BREAK ")
+        return before.strip() + " " + more_open + after.strip() + "\n" + more_close
+    else:
+        return text
+
+def convert_post(post):
+    """Convert one org-mode blog post into the text of an ikiwiki page.
+
+    post is the path of the .org file.  Its first line is taken as the
+    title and its second as the publication date (each after removing
+    the "#+HTML: " prefix), and its directory relative to POSTS supplies
+    the tags.  The whole file is run through pandoc (org ->
+    markdown_strict) and the result is cleaned up with strip_smarts,
+    fix_images and fix_more.
+
+    Returns the page text; saving it is left to the caller.  Raises
+    IndexError if the file has fewer than two lines.
+    """
+    with open(post, 'r') as h:
+        org = h.read()
+
+    header = org.splitlines()
+
+    title = strip_smarts(header[0].replace('#+HTML: ', ''))
+    title = "[[!meta title=\"" + title + "\"]]"
+
+    date = header[1].replace('#+HTML: #published ', '')
+    date = "[[!meta date=\"" + date + "\"]]"
+
+    # e.g. POSTS/tech/emacs/foo.org is tagged "tech emacs"
+    tags = os.path.dirname(post).replace(POSTS, "")[1:].replace("/", " ")
+    tags = "[[!tag  imported_PyBlosxom " + tags + "]]"
+
+    # this file generates a pandoc error:
+    # /home/swhitton/local/big/blog/linkdump/novdec14.org
+    pandoc = subprocess.Popen(["pandoc", "-f", "org", "-t", "markdown_strict"],
+                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    body = pandoc.communicate(input=org)[0]
+    if pandoc.returncode != 0:
+        # keep going with whatever pandoc produced, but say so instead
+        # of failing silently
+        sys.stderr.write("warning: pandoc exited with status %d on %s\n"
+                         % (pandoc.returncode, post))
+
+    body = strip_smarts(body)
+    body = fix_images(body)
+    body = fix_more(body)
+
+    return "\n".join([date, title, tags, "", body])
+
+def convert_comment(comment):
+    """Convert one XML comment file into an ikiwiki ._comment file.
+
+    comment is the path of the XML file.  Its <parent> element names the
+    post the comment belongs to; the result is written to
+    ENTRIES/<post slug>/comment_<16 random characters>._comment, and the
+    slug directory is created if it does not exist yet.  The HTML in
+    <description> is converted to Markdown with pandoc.
+
+    The username is the local part of <email> when an address is given,
+    and the <author> text otherwise.
+    """
+    # print "attempting to parse", comment
+    tree = ET.parse(comment)
+    root = tree.getroot()
+
+    # reference:
+
+    # [[!comment format=mdwn
+    #  username="spwhitton@171b57686690088a367b4b10ddf73c4ca6f16601"
+    #  nickname="spwhitton"
+    #  avatar="http://cdn.libravatar.org/avatar/40da86a5d03e6fa62515a9d762601ed2"
+    #  subject="And a second one, gravatar free"
+    #  date="2015-11-11T00:18:46Z"
+    #  content="""
+    # Here it is
+    # """]]
+
+    slug = os.path.basename(root.find('parent').text)
+    author = root.find('author').text
+    address = root.find('email')
+    # an <email> element can be present but empty, in which case its
+    # .text is None; fall back to the author then, too
+    if address is not None and address.text:
+        username = address.text.partition('@')[0]
+    else:
+        username = author
+
+    desc = root.find('description').text
+    pandoc = subprocess.Popen(["pandoc", "-f", "html", "-t", "markdown_strict"],
+                              stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    desc = pandoc.communicate(input=desc)[0]
+    if pandoc.returncode != 0:
+        # keep going with whatever pandoc produced, but say so
+        sys.stderr.write("warning: pandoc exited with status %d on %s\n"
+                         % (pandoc.returncode, comment))
+
+    directive = "\n".join([
+        "[[!comment format=mdwn",
+        " username=\"" + username + "\"",
+        " nickname=\"" + author + "\"",
+        " date=\"" + root.find('w3cdate').text + "\"",
+        " content=\"\"\"",
+        desc + "\"\"\"]]"
+    ])
+    if not directive.endswith("\n"):
+        directive = directive + "\n"
+
+    the_dir = os.path.join(ENTRIES, slug)
+    if not os.path.exists(the_dir):
+        os.mkdir(the_dir, 0755)
+    rands = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16))
+    # ^ http://stackoverflow.com/a/2257449
+    with open(os.path.join(the_dir, "comment_" + rands + "._comment"), 'w') as h:
+        h.write(directive)
+
+def _abort_if_exists(path):
+    """Exit with an error message and a non-zero status if path exists."""
+    if os.path.exists(path):
+        # sys.exit with a string prints it to stderr and exits with
+        # status 1, so a conflict is not mistaken for a successful run
+        sys.exit("uh oh!  conflict!  " + path + " exists!")
+
+def main():
+    """Convert every post under POSTS and every comment under COMMENTS.
+
+    Posts are written to ENTRIES as .mdwn files and attachments are
+    copied into IMAGES.  The run is aborted as soon as an output file
+    would overwrite an existing one.
+    """
+    git_dir = os.path.join(POSTS, ".git")
+
+    for root, dirs, files in os.walk(POSTS):
+
+        # skip all the templates stored in root of blog, and the git
+        # repository
+        if root == POSTS or root.startswith(git_dir):
+            continue
+
+        # 1. If there's no .org in this dir and we're at the bottom of
+        # a tree, then it's a dir for images only, so copy it
+        # verbatim.  And we know from inspection with old Haskell
+        # script that there are no conflicts other than inside these
+        # image-only directories
+        exts = [os.path.splitext(name)[1] for name in files]
+        if ".org" not in exts and not dirs:
+            dest = os.path.join(IMAGES, os.path.basename(root))
+            # copytree refuses to copy onto an existing destination
+            if not os.path.exists(dest):
+                shutil.copytree(root, dest)
+            continue
+
+        # 2. now convert posts and images in the usual way
+        for f in files:
+            source = os.path.join(root, f)
+            stem, ext = os.path.splitext(f)
+            if ext == ".org":
+                # convert_post, unlike convert_comment, relies on
+                # us to decide where to save it
+                post = convert_post(source)
+                fname = os.path.join(ENTRIES, stem + ".mdwn")
+                _abort_if_exists(fname) # safety if inspection wrong
+                if not post.endswith("\n"):
+                    post = post + "\n"
+                with open(fname, 'w') as h:
+                    h.write(post)
+            elif "thumb." not in f:
+                # thumbnails are deliberately left behind
+                _abort_if_exists(os.path.join(IMAGES, f)) # safety if inspection wrong
+                shutil.copy(source, IMAGES)
+
+    for root, dirs, files in os.walk(COMMENTS):
+        for f in files:
+            if os.path.splitext(f)[1] == ".cmt":
+                # convert_comment does the saving since the post to which
+                # the comment is associated is stored within the comment
+                convert_comment(os.path.join(root, f))
+
+# run the conversion only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()
diff --git a/src/hscripts/hscripts.cabal b/src/hscripts/hscripts.cabal
index c73ca53a..aa047fee 100644
--- a/src/hscripts/hscripts.cabal
+++ b/src/hscripts/hscripts.cabal
@@ -18,3 +18,13 @@ executable video4ipad
                      , optparse-applicative
                      , process
   default-language:    Haskell2010
+
+executable orgblosxom2ikiwiki
+  hs-source-dirs:      src
+  main-is:             orgblosxom2ikiwiki.hs
+  build-depends:       base
+                     , MissingH
+                     , filemanip
+                     , directory
+                     , filepath
+  default-language:    Haskell2010
diff --git a/src/hscripts/src/orgblosxom2ikiwiki.hs b/src/hscripts/src/orgblosxom2ikiwiki.hs
new file mode 100644
index 00000000..81ee8701
--- /dev/null
+++ b/src/hscripts/src/orgblosxom2ikiwiki.hs
@@ -0,0 +1,94 @@
+-- 0. check for filename collisions and generate .htaccess entry
+-- 1. parse first two lines: produce date=, modified= and title= (second one just in case it can be made to take effect)
+-- 2. pass remainder through pandoc (with options to use old-style markdown: for example, need a blank line between a paragraph and a list)
+-- 3. fix MORE and links in bottom part, and add tags (and imported_pyblosxom tag)
+-- 4. put file into entries dir, and images into wikiannex
+
+-- There are no blog post filename conflicts.  There are the following attachment conflicts:
+
+-- ["/home/swhitton/local/big/blog/korea/epik/classroom.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/classroom.jpg"
+-- ,"/home/swhitton/local/big/blog/korea/epik/classroomthumb.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/classroomthumb.jpg"
+-- ,"/home/swhitton/local/big/blog/tech/emacs/org-mode/freemindeg.html_files/icons/help.png"
+-- ,"/home/swhitton/local/big/blog/oxford/oliscrot/help.png"
+-- ,"/home/swhitton/local/big/blog/korea/epik/temple.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/temple.jpg"
+-- ,"/home/swhitton/local/big/blog/korea/epik/templethumb.jpg"
+-- ,"/home/swhitton/local/big/blog/writing/diary/jhcoip/templethumb.jpg"]
+
+import           Data.String.Utils    (replace)
+import           System.FilePath.Find
+
+-- import           System.Directory
+-- import           System.FilePath
+
+-- import           Control.Monad
+-- import           Data.List
+
+-- | Build the text of a wiki entry from a post's category and its source.
+-- The first two lines of the source go to 'parseMeta', the category to
+-- 'parseCat' and the remaining lines to 'convertBody'; the results are
+-- concatenated in that order.
+convertEntry :: String -> String -> String
+convertEntry category source =
+    unlines (concat [parseMeta header, parseCat category, convertBody (unlines rest)])
+  where
+    (header, rest) = splitAt 2 (lines source)
+
+-- | Turn a category path such as "/tech/gnu+linux" into a one-element
+-- list holding a tag directive.  Works by replacing every '/' with ' '.
+parseCat :: String -> [String]
+parseCat category = ["[[!tag" ++ map slashToSpace category ++ " ]]"]
+  where
+    slashToSpace '/' = ' '
+    slashToSpace c   = c
+
+-- | Turn the two header lines of a post into meta directives.
+--
+-- The first line is expected to be @#+HTML: \<title\>@ and the second
+-- @#+HTML: #published \<date\>@; the prefixes are removed by length, so
+-- the lines are not checked for actually starting with them.  Anything
+-- other than exactly two lines is reported with an explicit error
+-- rather than an anonymous pattern-match failure.
+parseMeta :: [String] -> [String]
+parseMeta [title, date] =
+    [ "[[!meta title=\"" ++ drop (length titlePrefix) title ++ "\"]]"
+    , "[[!meta date=\"" ++ drop (length datePrefix) date ++ "\"]]"
+    ]
+  where
+    -- 8 characters including the trailing space; dropping only 7 left a
+    -- leading space in the title
+    titlePrefix = "#+HTML: "
+    datePrefix  = "#+HTML: #published "
+parseMeta other =
+    error ("parseMeta: expected a title line and a date line, got "
+           ++ show (length other) ++ " line(s)")
+
+-- | Convert the body of a post (everything after the two header lines)
+-- into the lines of the wiki page body.
+--
+-- Not implemented yet.  A type signature without a binding does not
+-- compile, so this stub keeps the module buildable and fails loudly if
+-- the function is ever evaluated.
+convertBody :: String -> [String]
+convertBody _ = error "convertBody: not implemented"
+
+-- | Collect the posts (@.org@ files) and the attachments (everything
+-- else) found under the blog directory.  The conversion itself is not
+-- written yet, so the action ends in 'undefined'.
+main :: IO ()
+main = do
+    let rootDir = "/home/swhitton/doc/www/blog"
+    entries <- find always (extension ==? ".org") rootDir
+    -- (/=?) instead of @not \<$\> extension ==? ".org"@: (\<$\>) is
+    -- infixl 4 and (==?) is infix 4, so the two cannot be mixed without
+    -- parentheses
+    attachments <- find always (extension /=? ".org") rootDir
+
+    undefined
+
+-- getDirectoryContentsRecursive :: FilePath -> IO [FilePath]
+-- getDirectoryContentsRecursive path = do
+--     names <- filter (`notElem` [".", "..", ".git"]) <$> getDirectoryContents path
+--     liftM concat $ forM names $ \name -> do
+--         dir <- doesDirectoryExist (path </> name)
+--         if dir
+--             then getDirectoryContentsRecursive (path </> name)
+--             else return [path </> name]
+
+-- checkForDuplicates :: IO ()
+-- checkForDuplicates = do
+--     names <- (++)
+--              <$> getDirectoryContentsRecursive "/home/swhitton/local/big/blog"
+--              <*> getDirectoryContentsRecursive "/home/swhitton/src/wiki/blog/entry"
+--     let posts = takeBaseName <$> filter ((`elem` [".org", ".mdwn"]) . takeExtension) names
+--     let other = filter ((`notElem` [".org", ".mdwn"]) . takeExtension) names
+--     let other' = takeBaseName <$> other
+--     -- let otherDupes = remNonDupes . sortOn takeBaseName $ other
+--     let otherDupes = sortOn takeBaseName [ x | x <- other, takeBaseName x `elem` (takeBaseName <$> (other `listMinus` x))]
+--     if nub posts == posts
+--         then putStrLn "no post filenames conflict" -- this gets printed
+--         else putStrLn "oh no!  post filenames conflict!"
+--     if nub other' == other'
+--         then putStrLn "no attachment filenames conflict"
+--         else putStrLn "oh no!  attachment filenames conflict!" -- this gets printed!
+--     putStrLn . show $ otherDupes
+
+-- remNonDupes :: (Eq a) => [a] -> [a]
+-- remNonDupes = remNonDupes' []
+
+-- remNonDupes' :: (Eq a) => [a] -> [a] -> [a]
+-- remNonDupes' ys [] = ys
+-- remNonDupes' ys (x:xs) = if x `elem` xs || x `elem` ys
+--                          then remNonDupes' (ys ++ [x]) xs
+--                          else remNonDupes' ys xs
+
+-- main :: IO ()
+-- main = checkForDuplicates
+
+-- listMinus :: (Eq a) => [a] -> a -> [a]
+-- listMinus (x:xs) y = if x == y then xs else x:listMinus xs y
author	Sean Whitton <spwhitton@spwhitton.name>	2015-11-18 10:57:49 -0700
committer	Sean Whitton <spwhitton@spwhitton.name>	2015-11-18 10:57:49 -0700
commit	96135c8fc043319b09d328ee432aafaa414a0bfd (patch)
tree	771787da09eccbb864be123f49ed6332435f0d7d
parent	dff3b59fe11bd00da9ecbc7dbfe804a61b0f100b (diff)
download	dotfiles-96135c8fc043319b09d328ee432aafaa414a0bfd.tar.gz