1 files changed, 120 insertions, 65 deletions
diff --git a/Utility/Format.hs b/Utility/Format.hs
index a2470fa..930b7ee 100644
--- a/Utility/Format.hs
+++ b/Utility/Format.hs
@@ -1,6 +1,6 @@
 {- Formatted string handling.
  -
- - Copyright 2010, 2011 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2023 Joey Hess <id@joeyh.name>
  -
  - License: BSD-2-clause
  -}
@@ -9,8 +9,12 @@ module Utility.Format (
 	Format,
 	gen,
 	format,
+	escapedFormat,
+	formatContainsVar,
 	decode_c,
 	encode_c,
+	encode_c',
+	isUtf8Byte,
 	prop_encode_c_decode_c_roundtrip
 ) where
 
@@ -19,19 +23,23 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii
 import Data.Maybe (fromMaybe)
 import Data.Word (Word8)
 import Data.List (isPrefixOf)
-import qualified Codec.Binary.UTF8.String
 import qualified Data.Map as M
+import qualified Data.ByteString as S
 
 import Utility.PartialPrelude
-
-type FormatString = String
+import Utility.FileSystemEncoding
 
 {- A format consists of a list of fragments. -}
 type Format = [Frag]
 
-{- A fragment is either a constant string,
- - or a variable, with a justification. -}
-data Frag = Const String | Var String Justify
+{- A fragment is either a constant string, or a variable. -}
+data Frag
+	= Const String 
+	| Var
+		{ varName :: String
+		, varJustify :: Justify
+		, varEscaped :: Bool
+		}
 	deriving (Show)
 
 data Justify = LeftJustified Int | RightJustified Int | UnJustified
@@ -45,10 +53,9 @@ format :: Format -> Variables -> String
 format f vars = concatMap expand f
   where
 	expand (Const s) = s
-	expand (Var name j)
-		| "escaped_" `isPrefixOf` name =
-			justify j $ encode_c_strict $
-				getvar $ drop (length "escaped_") name
+	expand (Var name j esc)
+		| esc = justify j $ decodeBS $ escapedFormat $
+			encodeBS $ getvar name
 		| otherwise = justify j $ getvar name
 	getvar name = fromMaybe "" $ M.lookup name vars
 	justify UnJustified s        = s
@@ -57,13 +64,22 @@ format f vars = concatMap expand f
 	pad i s = take (i - length s) spaces
 	spaces = repeat ' '
 
+escapedFormat :: S.ByteString -> S.ByteString
+escapedFormat = encode_c needescape
+  where
+	needescape c = isUtf8Byte c ||
+		isSpace (chr (fromIntegral c)) ||
+		c == fromIntegral (ord '"')
+
 {- Generates a Format that can be used to expand variables in a
  - format string, such as "${foo} ${bar;10} ${baz;-10}\n"
  -
  - (This is the same type of format string used by dpkg-query.)
+ -
+ - Also, "${escaped_foo}" will apply encode_c to the value of variable foo.
  -}
-gen :: FormatString -> Format
-gen = filter (not . empty) . fuse [] . scan [] . decode_c
+gen :: String -> Format
+gen = filter (not . empty) . fuse [] . scan [] . decodeBS . decode_c . encodeBS
   where
 	-- The Format is built up in reverse, for efficiency,
 	-- and can have many adjacent Consts. Fusing it fixes both
@@ -94,42 +110,71 @@ gen = filter (not . empty) . fuse [] . scan [] . decode_c
 		| i < 0 = LeftJustified (-1 * i)
 		| otherwise = RightJustified i
 	novar v = "${" ++ reverse v
-	foundvar f v p = scan (Var (reverse v) p : f)
+	foundvar f varname_r p = 
+		let varname = reverse varname_r
+		    var = if "escaped_" `isPrefixOf` varname
+			then Var (drop (length "escaped_") varname) p True
+			else Var varname p False
+		in scan (var : f)
 
 empty :: Frag -> Bool
 empty (Const "") = True
 empty _ = False
 
+{- Check if a Format contains a variable with a specified name. -}
+formatContainsVar :: String -> Format -> Bool
+formatContainsVar v = any go
+  where
+	go (Var v' _ _) | v' == v = True
+	go _ = False
+
 {- Decodes a C-style encoding, where \n is a newline (etc),
  - \NNN is an octal encoded character, and \xNN is a hex encoded character.
  -}
-decode_c :: FormatString -> String
-decode_c [] = []
-decode_c s = unescape ("", s)
+decode_c :: S.ByteString -> S.ByteString
+decode_c s
+	| S.null s = S.empty
+	| otherwise = unescape (S.empty, s)
   where
-	e = '\\'
-	unescape (b, []) = b
-	-- look for escapes starting with '\'
-	unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)
+	e = fromIntegral (ord '\\')
+	x = fromIntegral (ord 'x')
+	isescape c = c == e
+	unescape (b, v)
+		| S.null v = b
+		| otherwise = b <> fst pair <> unescape (handle $ snd pair)
 	  where
-		pair = span (/= e) v
-	isescape x = x == e
-	handle (x:'x':n1:n2:rest)
-		| isescape x && allhex = (fromhex, rest)
+		pair = S.span (not . isescape) v
+	handle b
+		| S.length b >= 1 && isescape (S.index b 0) = handle' b
+		| otherwise = (S.empty, b)
+	
+	handle' b
+		| S.length b >= 4
+			&& S.index b 1 == x
+			&& allhex = (fromhex, rest)
 	  where
+		n1 = chr (fromIntegral (S.index b 2))
+		n2 = chr (fromIntegral (S.index b 3))
+	  	rest = S.drop 4 b
 		allhex = isHexDigit n1 && isHexDigit n2
-		fromhex = [chr $ readhex [n1, n2]]
+		fromhex = encodeBS [chr $ readhex [n1, n2]]
 		readhex h = Prelude.read $ "0x" ++ h :: Int
-	handle (x:n1:n2:n3:rest)
-		| isescape x && alloctal = (fromoctal, rest)
+	handle' b
+		| S.length b >= 4 && alloctal = (fromoctal, rest)
 	  where
+		n1 = chr (fromIntegral (S.index b 1))
+		n2 = chr (fromIntegral (S.index b 2))
+		n3 = chr (fromIntegral (S.index b 3))
+	  	rest = S.drop 4 b
 		alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3
-		fromoctal = [chr $ readoctal [n1, n2, n3]]
+		fromoctal = encodeBS [chr $ readoctal [n1, n2, n3]]
 		readoctal o = Prelude.read $ "0o" ++ o :: Int
-	-- \C is used for a few special characters
-	handle (x:nc:rest)
-		| isescape x = ([echar nc], rest)
+	handle' b
+		| S.length b >= 2 = 
+			(S.singleton (fromIntegral (ord (echar nc))), rest)
 	  where
+		nc = chr (fromIntegral (S.index b 1))
+		rest = S.drop 2 b
 		echar 'a' = '\a'
 		echar 'b' = '\b'
 		echar 'f' = '\f'
@@ -137,41 +182,50 @@ decode_c s = unescape ("", s)
 		echar 'r' = '\r'
 		echar 't' = '\t'
 		echar 'v' = '\v'
-		echar a = a
-	handle n = ("", n)
+		echar a = a -- \\ decodes to '\', and \" to '"'
+	handle' b = (S.empty, b)
 
-{- Inverse of decode_c. -}
-encode_c :: String -> FormatString
-encode_c = encode_c' (const False)
-
-{- Encodes more strictly, including whitespace. -}
-encode_c_strict :: String -> FormatString
-encode_c_strict = encode_c' isSpace
-
-encode_c' :: (Char -> Bool) -> String -> FormatString
-encode_c' p = concatMap echar
+{- Inverse of decode_c. Encodes ascii control characters as well as
+ - bytes that match the predicate. (And also '\' itself.)
+ -}
+encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString
+encode_c p s = fromMaybe s (encode_c' p s)
+
+{- Returns Nothing when nothing needs to be escaped in the input ByteString. -}
+encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString
+encode_c' p s
+	| S.any needencode s = Just (S.concatMap echar s)
+	| otherwise = Nothing
   where
-	e c = '\\' : [c]
-	echar '\a' = e 'a'
-	echar '\b' = e 'b'
-	echar '\f' = e 'f'
-	echar '\n' = e 'n'
-	echar '\r' = e 'r'
-	echar '\t' = e 't'
-	echar '\v' = e 'v'
-	echar '\\' = e '\\'
-	echar '"'  = e '"'
+	e = fromIntegral (ord '\\')
+	q = fromIntegral (ord '"')
+	del = 0x7F
+	iscontrol c = c < 0x20
+
+	echar 0x7 = ec 'a'
+	echar 0x8 = ec 'b'
+	echar 0x0C = ec 'f'
+	echar 0x0A = ec 'n'
+	echar 0x0D = ec 'r'
+	echar 0x09 = ec 't'
+	echar 0x0B = ec 'v'
 	echar c
-		| ord c < 0x20 = e_asc c -- low ascii
-		| ord c >= 256 = e_utf c -- unicode
-		| ord c > 0x7E = e_asc c -- high ascii
-		| p c          = e_asc c -- unprintable ascii
-		| otherwise    = [c]     -- printable ascii
-	-- unicode character is decomposed to individual Word8s,
-	-- and each is shown in octal
-	e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
-	e_asc c = showoctal $ ord c
-	showoctal i = '\\' : printf "%03o" i
+		| iscontrol c = showoctal c -- other control characters
+		| c == e = ec '\\' -- escape the escape character itself
+		| c == del = showoctal c
+		| p c = if c == q
+			then ec '"' -- escape double quote
+			else showoctal c
+		| otherwise = S.singleton c
+	
+	needencode c = iscontrol c || c == e || c == del || p c
+
+	ec c = S.pack [e, fromIntegral (ord c)]
+
+	showoctal i = encodeBS ('\\' : printf "%03o" i)
+
+isUtf8Byte :: Word8 -> Bool
+isUtf8Byte c = c >= 0x80
 
 {- For quickcheck. 
  -
@@ -182,6 +236,7 @@ encode_c' p = concatMap echar
  - This property papers over the problem, by only testing ascii.
  -}
 prop_encode_c_decode_c_roundtrip :: String -> Bool
-prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s')
+prop_encode_c_decode_c_roundtrip s = s' == 
+	decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s')))
   where
 	s' = filter isAscii s