merge from git-annex

Support building with unix-compat 0.7
author: Joey Hess <joeyh@joeyh.name> 2023-08-14 12:06:32 -0400
committer: Joey Hess <joeyh@joeyh.name> 2023-08-14 12:12:52 -0400
commit: edf83982be214f3c839fab9b659f645de53a9100 (patch)
tree: bef06cb750379c6d7942fc13b13fcb328201354c /Utility/Format.hs
parent: f0cd3a2a3758ddcd2f0900c16bdc1fb80bbd6e92 (diff)
download: git-repair-edf83982be214f3c839fab9b659f645de53a9100.tar.gz
1 files changed, 94 insertions, 55 deletions
diff --git a/Utility/Format.hs b/Utility/Format.hs
index 466988c..930b7ee 100644
--- a/Utility/Format.hs
+++ b/Utility/Format.hs
@@ -1,6 +1,6 @@
 {- Formatted string handling.
  -
- - Copyright 2010-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2023 Joey Hess <id@joeyh.name>
  -
  - License: BSD-2-clause
  -}
@@ -9,10 +9,12 @@ module Utility.Format (
 	Format,
 	gen,
 	format,
+	escapedFormat,
 	formatContainsVar,
 	decode_c,
 	encode_c,
 	encode_c',
+	isUtf8Byte,
 	prop_encode_c_decode_c_roundtrip
 ) where
 
@@ -21,12 +23,11 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
 import Data.Maybe (fromMaybe)
 import Data.Word (Word8)
 import Data.List (isPrefixOf)
-import qualified Codec.Binary.UTF8.String
 import qualified Data.Map as M
+import qualified Data.ByteString as S
 
 import Utility.PartialPrelude
-
-type FormatString = String
+import Utility.FileSystemEncoding
 
 {- A format consists of a list of fragments. -}
 type Format = [Frag]
@@ -53,7 +54,8 @@ format f vars = concatMap expand f
   where
 	expand (Const s) = s
 	expand (Var name j esc)
-		| esc = justify j $ encode_c' isSpace $ getvar name
+		| esc = justify j $ decodeBS $ escapedFormat $
+			encodeBS $ getvar name
 		| otherwise = justify j $ getvar name
 	getvar name = fromMaybe "" $ M.lookup name vars
 	justify UnJustified s        = s
@@ -62,6 +64,13 @@ format f vars = concatMap expand f
 	pad i s = take (i - length s) spaces
 	spaces = repeat ' '
 
+escapedFormat :: S.ByteString -> S.ByteString
+escapedFormat = encode_c needescape
+  where
+	needescape c = isUtf8Byte c ||
+		isSpace (chr (fromIntegral c)) ||
+		c == fromIntegral (ord '"')
+
 {- Generates a Format that can be used to expand variables in a
  - format string, such as "${foo} ${bar;10} ${baz;-10}\n"
  -
@@ -69,8 +78,8 @@ format f vars = concatMap expand f
  -
  - Also, "${escaped_foo}" will apply encode_c to the value of variable foo.
  -}
-gen :: FormatString -> Format
-gen = filter (not . empty) . fuse [] . scan [] . decode_c
+gen :: String -> Format
+gen = filter (not . empty) . fuse [] . scan [] . decodeBS . decode_c . encodeBS
   where
 	-- The Format is built up in reverse, for efficiency,
 	-- and can have many adjacent Consts. Fusing it fixes both
@@ -122,33 +131,50 @@ formatContainsVar v = any go
 {- Decodes a C-style encoding, where \n is a newline (etc),
  - \NNN is an octal encoded character, and \xNN is a hex encoded character.
  -}
-decode_c :: FormatString -> String
-decode_c [] = []
-decode_c s = unescape ("", s)
+decode_c :: S.ByteString -> S.ByteString
+decode_c s
+	| S.null s = S.empty
+	| otherwise = unescape (S.empty, s)
   where
-	e = '\\'
-	unescape (b, []) = b
-	-- look for escapes starting with '\'
-	unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)
+	e = fromIntegral (ord '\\')
+	x = fromIntegral (ord 'x')
+	isescape c = c == e
+	unescape (b, v)
+		| S.null v = b
+		| otherwise = b <> fst pair <> unescape (handle $ snd pair)
 	  where
-		pair = span (/= e) v
-	isescape x = x == e
-	handle (x:'x':n1:n2:rest)
-		| isescape x && allhex = (fromhex, rest)
+		pair = S.span (not . isescape) v
+	handle b
+		| S.length b >= 1 && isescape (S.index b 0) = handle' b
+		| otherwise = (S.empty, b)
+	
+	handle' b
+		| S.length b >= 4
+			&& S.index b 1 == x
+			&& allhex = (fromhex, rest)
 	  where
+		n1 = chr (fromIntegral (S.index b 2))
+		n2 = chr (fromIntegral (S.index b 3))
+	  	rest = S.drop 4 b
 		allhex = isHexDigit n1 && isHexDigit n2
-		fromhex = [chr $ readhex [n1, n2]]
+		fromhex = encodeBS [chr $ readhex [n1, n2]]
 		readhex h = Prelude.read $ "0x" ++ h :: Int
-	handle (x:n1:n2:n3:rest)
-		| isescape x && alloctal = (fromoctal, rest)
+	handle' b
+		| S.length b >= 4 && alloctal = (fromoctal, rest)
 	  where
+		n1 = chr (fromIntegral (S.index b 1))
+		n2 = chr (fromIntegral (S.index b 2))
+		n3 = chr (fromIntegral (S.index b 3))
+	  	rest = S.drop 4 b
 		alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3
-		fromoctal = [chr $ readoctal [n1, n2, n3]]
+		fromoctal = encodeBS [chr $ readoctal [n1, n2, n3]]
 		readoctal o = Prelude.read $ "0o" ++ o :: Int
-	-- \C is used for a few special characters
-	handle (x:nc:rest)
-		| isescape x = ([echar nc], rest)
+	handle' b
+		| S.length b >= 2 = 
+			(S.singleton (fromIntegral (ord (echar nc))), rest)
 	  where
+		nc = chr (fromIntegral (S.index b 1))
+		rest = S.drop 2 b
 		echar 'a' = '\a'
 		echar 'b' = '\b'
 		echar 'f' = '\f'
@@ -156,38 +182,50 @@ decode_c s = unescape ("", s)
 		echar 'r' = '\r'
 		echar 't' = '\t'
 		echar 'v' = '\v'
-		echar a = a
-	handle n = ("", n)
-
-{- Inverse of decode_c. -}
-encode_c :: String -> FormatString
-encode_c = encode_c' (const False)
+		echar a = a -- \\ decodes to '\', and \" to '"'
+	handle' b = (S.empty, b)
 
-{- Encodes special characters, as well as any matching the predicate. -}
-encode_c' :: (Char -> Bool) -> String -> FormatString
-encode_c' p = concatMap echar
+{- Inverse of decode_c. Encodes ascii control characters as well as
+ - bytes that match the predicate. (And also '\' itself.)
+ -}
+encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
+encode_c p s = fromMaybe s (encode_c' p s)
+
+{- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
+encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
+encode_c' p s
+	| S.any needencode s = Just (S.concatMap echar s)
+	| otherwise = Nothing
   where
-	e c = '\\' : [c]
-	echar '\a' = e 'a'
-	echar '\b' = e 'b'
-	echar '\f' = e 'f'
-	echar '\n' = e 'n'
-	echar '\r' = e 'r'
-	echar '\t' = e 't'
-	echar '\v' = e 'v'
-	echar '\\' = e '\\'
-	echar '"'  = e '"'
+	e = fromIntegral (ord '\\')
+	q = fromIntegral (ord '"')
+	del = 0x7F
+	iscontrol c = c < 0x20
+
+	echar 0x7 = ec 'a'
+	echar 0x8 = ec 'b'
+	echar 0x0C = ec 'f'
+	echar 0x0A = ec 'n'
+	echar 0x0D = ec 'r'
+	echar 0x09 = ec 't'
+	echar 0x0B = ec 'v'
 	echar c
-		| ord c < 0x20 = e_asc c -- low ascii
-		| ord c >= 256 = e_utf c -- unicode
-		| ord c > 0x7E = e_asc c -- high ascii
-		| p c          = e_asc c
-		| otherwise    = [c]
-	-- unicode character is decomposed to individual Word8s,
-	-- and each is shown in octal
-	e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
-	e_asc c = showoctal $ ord c
-	showoctal i = '\\' : printf "%03o" i
+		| iscontrol c = showoctal c -- other control characters
+		| c == e = ec '\\' -- escape the escape character itself
+		| c == del = showoctal c
+		| p c = if c == q
+			then ec '"' -- escape double quote
+			else showoctal c
+		| otherwise = S.singleton c
+	
+	needencode c = iscontrol c || c == e || c == del || p c
+
+	ec c = S.pack [e, fromIntegral (ord c)]
+
+	showoctal i = encodeBS ('\\' : printf "%03o" i)
+
+isUtf8Byte :: Word8 -> Bool
+isUtf8Byte c = c >= 0x80
 
 {- For quickcheck. 
  -
@@ -198,6 +236,7 @@ encode_c' p = concatMap echar
  - This property papers over the problem, by only testing ascii.
  -}
 prop_encode_c_decode_c_roundtrip :: String -> Bool
-prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s')
+prop_encode_c_decode_c_roundtrip s = s' == 
+	decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
   where
 	s' = filter isAscii s
author	Joey Hess <joeyh@joeyh.name>	2023-08-14 12:06:32 -0400
committer	Joey Hess <joeyh@joeyh.name>	2023-08-14 12:12:52 -0400
commit	edf83982be214f3c839fab9b659f645de53a9100 (patch)
tree	bef06cb750379c6d7942fc13b13fcb328201354c /Utility/Format.hs
parent	f0cd3a2a3758ddcd2f0900c16bdc1fb80bbd6e92 (diff)
download	git-repair-edf83982be214f3c839fab9b659f645de53a9100.tar.gz