diff options
Diffstat (limited to 'Utility/Format.hs')
-rw-r--r-- | Utility/Format.hs | 185 |
1 files changed, 120 insertions, 65 deletions
diff --git a/Utility/Format.hs b/Utility/Format.hs index a2470fa..930b7ee 100644 --- a/Utility/Format.hs +++ b/Utility/Format.hs @@ -1,6 +1,6 @@ {- Formatted string handling. - - - Copyright 2010, 2011 Joey Hess <id@joeyh.name> + - Copyright 2010-2023 Joey Hess <id@joeyh.name> - - License: BSD-2-clause -} @@ -9,8 +9,12 @@ module Utility.Format ( Format, gen, format, + escapedFormat, + formatContainsVar, decode_c, encode_c, + encode_c', + isUtf8Byte, prop_encode_c_decode_c_roundtrip ) where @@ -19,19 +23,23 @@ import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii import Data.Maybe (fromMaybe) import Data.Word (Word8) import Data.List (isPrefixOf) -import qualified Codec.Binary.UTF8.String import qualified Data.Map as M +import qualified Data.ByteString as S import Utility.PartialPrelude - -type FormatString = String +import Utility.FileSystemEncoding {- A format consists of a list of fragments. -} type Format = [Frag] -{- A fragment is either a constant string, - - or a variable, with a justification. -} -data Frag = Const String | Var String Justify +{- A fragment is either a constant string, or a variable. -} +data Frag + = Const String + | Var + { varName :: String + , varJustify :: Justify + , varEscaped :: Bool + } deriving (Show) data Justify = LeftJustified Int | RightJustified Int | UnJustified @@ -45,10 +53,9 @@ format :: Format -> Variables -> String format f vars = concatMap expand f where expand (Const s) = s - expand (Var name j) - | "escaped_" `isPrefixOf` name = - justify j $ encode_c_strict $ - getvar $ drop (length "escaped_") name + expand (Var name j esc) + | esc = justify j $ decodeBS $ escapedFormat $ + encodeBS $ getvar name | otherwise = justify j $ getvar name getvar name = fromMaybe "" $ M.lookup name vars justify UnJustified s = s @@ -57,13 +64,22 @@ format f vars = concatMap expand f pad i s = take (i - length s) spaces spaces = repeat ' ' +escapedFormat :: S.ByteString -> S.ByteString +escapedFormat = encode_c needescape + where + needescape c = isUtf8Byte c || + isSpace (chr (fromIntegral c)) || + c == fromIntegral (ord '"') + {- Generates a Format that can be used to expand variables in a - format string, such as "${foo} ${bar;10} ${baz;-10}\n" - - (This is the same type of format string used by dpkg-query.) + - + - Also, "${escaped_foo}" will apply encode_c to the value of variable foo. -} -gen :: FormatString -> Format -gen = filter (not . empty) . fuse [] . scan [] . decode_c +gen :: String -> Format +gen = filter (not . empty) . fuse [] . scan [] . decodeBS . decode_c . encodeBS where -- The Format is built up in reverse, for efficiency, -- and can have many adjacent Consts. Fusing it fixes both @@ -94,42 +110,71 @@ gen = filter (not . empty) . fuse [] . scan [] . decode_c | i < 0 = LeftJustified (-1 * i) | otherwise = RightJustified i novar v = "${" ++ reverse v - foundvar f v p = scan (Var (reverse v) p : f) + foundvar f varname_r p = + let varname = reverse varname_r + var = if "escaped_" `isPrefixOf` varname + then Var (drop (length "escaped_") varname) p True + else Var varname p False + in scan (var : f) empty :: Frag -> Bool empty (Const "") = True empty _ = False +{- Check if a Format contains a variable with a specified name. -} +formatContainsVar :: String -> Format -> Bool +formatContainsVar v = any go + where + go (Var v' _ _) | v' == v = True + go _ = False + {- Decodes a C-style encoding, where \n is a newline (etc), - \NNN is an octal encoded character, and \xNN is a hex encoded character. -} -decode_c :: FormatString -> String -decode_c [] = [] -decode_c s = unescape ("", s) +decode_c :: S.ByteString -> S.ByteString +decode_c s + | S.null s = S.empty + | otherwise = unescape (S.empty, s) where - e = '\\' - unescape (b, []) = b - -- look for escapes starting with '\' - unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair) + e = fromIntegral (ord '\\') + x = fromIntegral (ord 'x') + isescape c = c == e + unescape (b, v) + | S.null v = b + | otherwise = b <> fst pair <> unescape (handle $ snd pair) where - pair = span (/= e) v - isescape x = x == e - handle (x:'x':n1:n2:rest) - | isescape x && allhex = (fromhex, rest) + pair = S.span (not . isescape) v + handle b + | S.length b >= 1 && isescape (S.index b 0) = handle' b + | otherwise = (S.empty, b) + + handle' b + | S.length b >= 4 + && S.index b 1 == x + && allhex = (fromhex, rest) where + n1 = chr (fromIntegral (S.index b 2)) + n2 = chr (fromIntegral (S.index b 3)) + rest = S.drop 4 b allhex = isHexDigit n1 && isHexDigit n2 - fromhex = [chr $ readhex [n1, n2]] + fromhex = encodeBS [chr $ readhex [n1, n2]] readhex h = Prelude.read $ "0x" ++ h :: Int - handle (x:n1:n2:n3:rest) - | isescape x && alloctal = (fromoctal, rest) + handle' b + | S.length b >= 4 && alloctal = (fromoctal, rest) where + n1 = chr (fromIntegral (S.index b 1)) + n2 = chr (fromIntegral (S.index b 2)) + n3 = chr (fromIntegral (S.index b 3)) + rest = S.drop 4 b alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3 - fromoctal = [chr $ readoctal [n1, n2, n3]] + fromoctal = encodeBS [chr $ readoctal [n1, n2, n3]] readoctal o = Prelude.read $ "0o" ++ o :: Int - -- \C is used for a few special characters - handle (x:nc:rest) - | isescape x = ([echar nc], rest) + handle' b + | S.length b >= 2 = + (S.singleton (fromIntegral (ord (echar nc))), rest) where + nc = chr (fromIntegral (S.index b 1)) + rest = S.drop 2 b echar 'a' = '\a' echar 'b' = '\b' echar 'f' = '\f' @@ -137,41 +182,50 @@ decode_c s = unescape ("", s) echar 'r' = '\r' echar 't' = '\t' echar 'v' = '\v' - echar a = a - handle n = ("", n) + echar a = a -- \\ decodes to '\', and \" to '"' + handle' b = (S.empty, b) -{- Inverse of decode_c. -} -encode_c :: String -> FormatString -encode_c = encode_c' (const False) - -{- Encodes more strictly, including whitespace. -} -encode_c_strict :: String -> FormatString -encode_c_strict = encode_c' isSpace - -encode_c' :: (Char -> Bool) -> String -> FormatString -encode_c' p = concatMap echar +{- Inverse of decode_c. Encodes ascii control characters as well as + - bytes that match the predicate. (And also '\' itself.) + -} +encode_c :: (Word8 -> Bool) -> S.ByteString -> S.ByteString +encode_c p s = fromMaybe s (encode_c' p s) + +{- Returns Nothing when nothing needs to be escaped in the input ByteString. -} +encode_c' :: (Word8 -> Bool) -> S.ByteString -> Maybe S.ByteString +encode_c' p s + | S.any needencode s = Just (S.concatMap echar s) + | otherwise = Nothing where - e c = '\\' : [c] - echar '\a' = e 'a' - echar '\b' = e 'b' - echar '\f' = e 'f' - echar '\n' = e 'n' - echar '\r' = e 'r' - echar '\t' = e 't' - echar '\v' = e 'v' - echar '\\' = e '\\' - echar '"' = e '"' + e = fromIntegral (ord '\\') + q = fromIntegral (ord '"') + del = 0x7F + iscontrol c = c < 0x20 + + echar 0x7 = ec 'a' + echar 0x8 = ec 'b' + echar 0x0C = ec 'f' + echar 0x0A = ec 'n' + echar 0x0D = ec 'r' + echar 0x09 = ec 't' + echar 0x0B = ec 'v' echar c - | ord c < 0x20 = e_asc c -- low ascii - | ord c >= 256 = e_utf c -- unicode - | ord c > 0x7E = e_asc c -- high ascii - | p c = e_asc c -- unprintable ascii - | otherwise = [c] -- printable ascii - -- unicode character is decomposed to individual Word8s, - -- and each is shown in octal - e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8]) - e_asc c = showoctal $ ord c - showoctal i = '\\' : printf "%03o" i + | iscontrol c = showoctal c -- other control characters + | c == e = ec '\\' -- escape the escape character itself + | c == del = showoctal c + | p c = if c == q + then ec '"' -- escape double quote + else showoctal c + | otherwise = S.singleton c + + needencode c = iscontrol c || c == e || c == del || p c + + ec c = S.pack [e, fromIntegral (ord c)] + + showoctal i = encodeBS ('\\' : printf "%03o" i) + +isUtf8Byte :: Word8 -> Bool +isUtf8Byte c = c >= 0x80 {- For quickcheck. - @@ -182,6 +236,7 @@ encode_c' p = concatMap echar - This property papers over the problem, by only testing ascii. -} prop_encode_c_decode_c_roundtrip :: String -> Bool -prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s') +prop_encode_c_decode_c_roundtrip s = s' == + decodeBS (decode_c (encode_c isUtf8Byte (encodeBS s'))) where s' = filter isAscii s |