utf8-string-1.0.2/0000755000000000000000000000000007346545000012075 5ustar0000000000000000utf8-string-1.0.2/CHANGELOG.markdown0000755000000000000000000000060607346545000015135 0ustar00000000000000001.0.2 ----- * Add fromChar * Add IsString UTF8 instance * Fixup documentation and tests 1.0.1.1 ----- * Build correctly on GHC-7.0 (#14) 1.0.1 ----- * Improve the performance of Data.ByteString.Lazy.UTF8.fromString. (Thanks, ndmitchell) 1 ----- * Remove out all the old utf8 IO support. GHC supports utf8 now. 0.3.8 ----- * Performance tweaks * bytestring-in-base flag default to False utf8-string-1.0.2/Codec/Binary/UTF8/0000755000000000000000000000000007346545000015064 5ustar0000000000000000utf8-string-1.0.2/Codec/Binary/UTF8/Generic.hs0000644000000000000000000002514307346545000017001 0ustar0000000000000000{-# LANGUAGE CPP, MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Codec.Binary.UTF8.Generic -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- module Codec.Binary.UTF8.Generic ( UTF8Bytes(..) , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Int import Data.Word import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.List as List import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines,null,tail) import Codec.Binary.UTF8.String(encode) #ifdef BYTESTRING_IN_BASE import Data.ByteString.Base (unsafeHead, unsafeTail) #endif class (Num s, Ord s) => UTF8Bytes b s | b -> s where bsplit :: s -> b -> (b,b) bdrop :: s -> b -> b buncons :: b -> Maybe (Word8,b) elemIndex :: Word8 -> b -> Maybe s empty :: b null :: b -> Bool pack :: [Word8] -> b tail :: b -> b instance UTF8Bytes B.ByteString Int where bsplit = B.splitAt bdrop = B.drop buncons = unconsB elemIndex = B.elemIndex empty = B.empty null = B.null pack = B.pack tail = B.tail instance UTF8Bytes L.ByteString Int64 where bsplit = L.splitAt bdrop = L.drop buncons = unconsL elemIndex = L.elemIndex empty = L.empty null = L.null pack = L.pack tail = L.tail instance UTF8Bytes [Word8] Int where bsplit = List.splitAt bdrop = List.drop buncons (x:xs) = Just (x,xs) buncons [] = Nothing elemIndex x xs = List.elemIndex (toEnum (fromEnum x)) xs empty = [] null = List.null pack = id tail = List.tail -- | Converts a Haskell string into a UTF8 encoded bytestring. {-# SPECIALIZE fromString :: String -> B.ByteString #-} {-# SPECIALIZE fromString :: String -> L.ByteString #-} {-# SPECIALIZE fromString :: String -> [Word8] #-} fromString :: UTF8Bytes b s => String -> b fromString xs = pack (encode xs) -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with @\'\\0xFFFD\'@. {-# SPECIALIZE toString :: B.ByteString -> String #-} {-# SPECIALIZE toString :: L.ByteString -> String #-} {-# SPECIALIZE toString :: [Word8] -> String #-} toString :: UTF8Bytes b s => b -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character @\'\\0xFFFD\'@. -- XXX: Should we combine sequences of errors into a single replacement -- character? {-# SPECIALIZE decode :: B.ByteString -> Maybe (Char,Int) #-} {-# SPECIALIZE decode :: L.ByteString -> Maybe (Char,Int64) #-} {-# SPECIALIZE decode :: [Word8] -> Maybe (Char,Int) #-} decode :: UTF8Bytes b s => b -> Maybe (Char,s) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask c m = fromEnum (c .&. m) combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) -- | Split after a given number of characters. -- Negative values are treated as if they are 0. {-# SPECIALIZE splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString) #-} {-# SPECIALIZE splitAt :: Int64 -> L.ByteString -> (L.ByteString,L.ByteString) #-} {-# SPECIALIZE splitAt :: Int -> [Word8] -> ([Word8],[Word8]) #-} splitAt :: UTF8Bytes b s => s -> b -> (b,b) splitAt x bs = loop 0 x bs where loop a n _ | n <= 0 = bsplit a bs loop a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (bdrop y bs1) Nothing -> (bs, empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. {-# INLINE take #-} take :: UTF8Bytes b s => s -> b -> b take n bs = fst (splitAt n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. {-# INLINE drop #-} drop :: UTF8Bytes b s => s -> b -> b drop n bs = snd (splitAt n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. {-# SPECIALIZE span :: (Char -> Bool) -> B.ByteString -> (B.ByteString,B.ByteString) #-} {-# SPECIALIZE span :: (Char -> Bool) -> L.ByteString -> (L.ByteString,L.ByteString) #-} {-# SPECIALIZE span :: (Char -> Bool) -> [Word8] -> ([Word8],[Word8]) #-} span :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (bdrop n cs) _ -> bsplit a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. {-# INLINE break #-} break :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by @\'\\0xFFFD\'@. {-# INLINE uncons #-} uncons :: UTF8Bytes b s => b -> Maybe (Char,b) uncons bs = do (c,n) <- decode bs return (c, bdrop n bs) -- | Traverse a bytestring (right biased). {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> B.ByteString -> a #-} {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> L.ByteString -> a #-} {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> [Word8] -> a #-} foldr :: UTF8Bytes b s => (Char -> a -> a) -> a -> b -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> B.ByteString -> a #-} {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> L.ByteString -> a #-} {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> [Word8] -> a #-} foldl :: UTF8Bytes b s => (a -> Char -> a) -> a -> b -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. {-# SPECIALIZE length :: B.ByteString -> Int #-} {-# SPECIALIZE length :: L.ByteString -> Int64 #-} {-# SPECIALIZE length :: [Word8] -> Int #-} length :: UTF8Bytes b s => b -> s length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (bdrop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines''. {-# SPECIALIZE lines :: B.ByteString -> [B.ByteString] #-} {-# SPECIALIZE lines :: L.ByteString -> [L.ByteString] #-} {-# SPECIALIZE lines :: [Word8] -> [[Word8]] #-} lines :: UTF8Bytes b s => b -> [b] lines bs | null bs = [] lines bs = case elemIndex 10 bs of Just x -> let (xs,ys) = bsplit x bs in xs : lines (tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. {-# SPECIALIZE lines' :: B.ByteString -> [B.ByteString] #-} {-# SPECIALIZE lines' :: L.ByteString -> [L.ByteString] #-} {-# SPECIALIZE lines' :: [Word8] -> [[Word8]] #-} lines' :: UTF8Bytes b s => b -> [b] lines' bs | null bs = [] lines' bs = case elemIndex 10 bs of Just x -> let (xs,ys) = bsplit (x+1) bs in xs : lines' ys Nothing -> [bs] ----------- -- Compatibility functions for base-2 unconsB :: B.ByteString -> Maybe (Word8,B.ByteString) unconsL :: L.ByteString -> Maybe (Word8,L.ByteString) #ifdef BYTESTRING_IN_BASE unconsB bs | B.null bs = Nothing | otherwise = Just (unsafeHead bs, unsafeTail bs) unconsL bs = case L.toChunks bs of (x:xs) | not (B.null x) -> Just (unsafeHead x, L.fromChunks (unsafeTail x:xs)) _ -> Nothing #else unconsB = B.uncons unconsL = L.uncons #endif utf8-string-1.0.2/Codec/Binary/UTF8/String.hs0000644000000000000000000001201307346545000016663 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Codec.Binary.UTF8.String -- Copyright : (c) Eric Mertens 2007 -- License : BSD3-style (see LICENSE) -- -- Maintainer: emertens@galois.com -- Stability : experimental -- Portability : portable -- -- Support for encoding UTF8 Strings to and from @['Word8']@ -- module Codec.Binary.UTF8.String ( encode , decode , encodeString , decodeString , encodeChar , isUTF8Encoded , utf8Encode ) where import Data.Word (Word8,Word32) import Data.Bits ((.|.),(.&.),shiftL,shiftR) import Data.Char (chr,ord) default(Int) -- | Encode a string using 'encode' and store the result in a 'String'. encodeString :: String -> String encodeString xs = map (toEnum . fromEnum) (encode xs) -- | Decode a string using 'decode' using a 'String' as input. -- This is not safe but it is necessary if UTF-8 encoded text -- has been loaded into a 'String' prior to being decoded. decodeString :: String -> String decodeString xs = decode (map (toEnum . fromEnum) xs) replacement_character :: Char replacement_character = '\xfffd' -- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format. encodeChar :: Char -> [Word8] encodeChar = map fromIntegral . go . ord where go oc | oc <= 0x7f = [oc] | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) , 0x80 + oc .&. 0x3f ] | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) , 0x80 + ((oc `shiftR` 6) .&. 0x3f) , 0x80 + oc .&. 0x3f ] | otherwise = [ 0xf0 + (oc `shiftR` 18) , 0x80 + ((oc `shiftR` 12) .&. 0x3f) , 0x80 + ((oc `shiftR` 6) .&. 0x3f) , 0x80 + oc .&. 0x3f ] -- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format. encode :: String -> [Word8] encode = concatMap encodeChar -- -- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String' -- decode :: [Word8] -> String decode [ ] = "" decode (c:cs) | c < 0x80 = chr (fromEnum c) : decode cs | c < 0xc0 = replacement_character : decode cs | c < 0xe0 = multi1 | c < 0xf0 = multi_byte 2 0xf 0x800 | c < 0xf8 = multi_byte 3 0x7 0x10000 | c < 0xfc = multi_byte 4 0x3 0x200000 | c < 0xfe = multi_byte 5 0x1 0x4000000 | otherwise = replacement_character : decode cs where multi1 = case cs of c1 : ds | c1 .&. 0xc0 == 0x80 -> let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) in if d >= 0x000080 then toEnum d : decode ds else replacement_character : decode ds _ -> replacement_character : decode cs multi_byte :: Int -> Word8 -> Int -> [Char] multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) where aux 0 rs acc | overlong <= acc && acc <= 0x10ffff && (acc < 0xd800 || 0xdfff < acc) && (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs | otherwise = replacement_character : decode rs aux n (r:rs) acc | r .&. 0xc0 == 0x80 = aux (n-1) rs $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) aux _ rs _ = replacement_character : decode rs -- | @utf8Encode str@ is a convenience function; checks to see if -- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but -- you are better off keeping track of the encoding so as to avoid -- the cost of checking. utf8Encode :: String -> String utf8Encode str | isUTF8Encoded str = str | otherwise = encodeString str -- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form. isUTF8Encoded :: String -> Bool isUTF8Encoded [] = True isUTF8Encoded (x:xs) = case ox of _ | ox < 0x80 -> isUTF8Encoded xs | ox > 0xff -> False | ox < 0xc0 -> False | ox < 0xe0 -> check1 | ox < 0xf0 -> check_byte 2 0xf 0 | ox < 0xf8 -> check_byte 3 0x7 0x10000 | ox < 0xfc -> check_byte 4 0x3 0x200000 | ox < 0xfe -> check_byte 5 0x1 0x4000000 | otherwise -> False where ox = toW32 x toW32 :: Char -> Word32 toW32 ch = fromIntegral (fromEnum ch) check1 = case xs of [] -> False c1 : ds | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False | otherwise -> isUTF8Encoded ds where oc = toW32 c1 d = ((ox .&. 0x1f) `shiftL` 6) .|. (oc .&. 0x3f) check_byte :: Int -> Word32 -> Word32 -> Bool check_byte i mask overlong = aux i xs (ox .&. mask) where aux 0 rs acc | overlong <= acc && acc <= 0x10ffff && (acc < 0xd800 || 0xdfff < acc) && (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs | otherwise = False aux n (r:rs) acc | toW32 r .&. 0xc0 == 0x80 = aux (n-1) rs (acc `shiftL` 6 .|. (toW32 r .&. 0x3f)) aux _ _ _ = False utf8-string-1.0.2/Data/ByteString/Lazy/0000755000000000000000000000000007346545000015757 5ustar0000000000000000utf8-string-1.0.2/Data/ByteString/Lazy/UTF8.hs0000644000000000000000000002577007346545000017054 0ustar0000000000000000{-# LANGUAGE CPP, BangPatterns #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.ByteString.Lazy.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- -- This module provides fast, validated encoding and decoding functions -- between 'ByteString's and 'String's. It does not exactly match the -- output of the Codec.Binary.UTF8.String output for invalid encodings -- as the number of replacement characters is sometimes longer. module Data.ByteString.Lazy.UTF8 ( B.ByteString , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Word import Data.Int import Foreign.Storable import Foreign.Ptr import Foreign.ForeignPtr import Data.Char (ord) import Control.Exception (assert) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B import qualified Data.ByteString.Internal as S import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines) import Codec.Binary.UTF8.Generic (buncons) #if MIN_VERSION_base(4,4,0) import System.IO.Unsafe (unsafeDupablePerformIO) #else import GHC.IO (unsafeDupablePerformIO) #endif --------------------------------------------------------------------- -- ENCODING -- | Converts a Haskell string into a UTF8 encoded bytestring. fromString :: String -> B.ByteString fromString [] = B.empty fromString xs0 = packChunks 32 xs0 where packChunks n xs = case packUptoLenBytes n xs of (bs, [] ) -> B.chunk bs B.Empty (bs, xs') -> B.Chunk bs (packChunks (min (n * 2) B.smallChunkSize) xs') packUptoLenBytes :: Int -> String -> (S.ByteString, String) packUptoLenBytes len xs = unsafeCreateUptoN' len $ \ptr -> do (end, xs') <- go ptr (ptr `plusPtr` (len-4)) xs return (end `minusPtr` ptr, xs') -- end is the last position at which you can write a whole 4 byte sequence safely go :: Ptr Word8 -> Ptr Word8 -> String -> IO (Ptr Word8, String) go !ptr !end xs | ptr > end = return (ptr, xs) go !ptr !_ [] = return (ptr, []) go !ptr !end (x:xs) | x <= '\x7f' = poke ptr (S.c2w x) >> go (plusPtr ptr 1) end xs | otherwise = case ord x of oc | oc <= 0x7ff -> do poke ptr $ fromIntegral $ 0xc0 + (oc `shiftR` 6) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 2) end xs | oc <= 0xffff -> do poke ptr $ fromIntegral $ 0xe0 + (oc `shiftR` 12) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + ((oc `shiftR` 6) .&. 0x3f) pokeElemOff ptr 2 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 3) end xs | otherwise -> do poke ptr $ fromIntegral $ 0xf0 + (oc `shiftR` 18) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + ((oc `shiftR` 12) .&. 0x3f) pokeElemOff ptr 2 $ fromIntegral $ 0x80 + ((oc `shiftR` 6) .&. 0x3f) pokeElemOff ptr 3 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 4) end xs --------------------------------------------------------------------- -- DECODING -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with @\'\\0xFFFD\'@. toString :: B.ByteString -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character @\'\\0xFFFD\'@. -- XXX: Should we combine sequences of errors into a single replacement -- character? decode :: B.ByteString -> Maybe (Char,Int64) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose :: Int -> B.ByteString -> (Char, Int64) choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask :: Int -> Int -> Int mask c m = fromEnum (c .&. m) combine :: Int -> Word8 -> Int combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower :: Int -> Word8 -> Maybe Int follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower :: Int -> B.ByteString -> Maybe (Int, B.ByteString) get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 :: Int -> B.ByteString -> (Char, Int64) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 :: Int -> B.ByteString -> (Char, Int64) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 :: Int -> B.ByteString -> (Char, Int64) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) {-# INLINE decode #-} -- | Split after a given number of characters. -- Negative values are treated as if they are 0. splitAt :: Int64 -> B.ByteString -> (B.ByteString,B.ByteString) splitAt x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.splitAt a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> (bs, B.empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: Int64 -> B.ByteString -> B.ByteString take x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.take a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> bs -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: Int64 -> B.ByteString -> B.ByteString drop x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.drop a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> B.empty -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. span :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (B.drop n cs) _ -> B.splitAt a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. break :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by @\'\\0xFFFD\'@. uncons :: B.ByteString -> Maybe (Char,B.ByteString) uncons bs = do (c,n) <- decode bs return (c, B.drop n bs) -- | Traverse a bytestring (right biased). foldr :: (Char -> a -> a) -> a -> B.ByteString -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: (a -> Char -> a) -> a -> B.ByteString -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. length :: B.ByteString -> Int length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (B.drop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines''. lines :: B.ByteString -> [B.ByteString] lines bs | B.null bs = [] lines bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt x bs in xs : lines (B.tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: B.ByteString -> [B.ByteString] lines' bs | B.null bs = [] lines' bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt (x+1) bs in xs : lines' ys Nothing -> [bs] --------------------------------------------------------------------- -- COPIED FROM BYTESTRING -- These functions are copied verbatum from Data.ByteString.Internal -- I suspect their lack of export is an oversight unsafeCreateUptoN' :: Int -> (Ptr Word8 -> IO (Int, a)) -> (S.ByteString, a) unsafeCreateUptoN' l f = unsafeDupablePerformIO (createUptoN' l f) {-# INLINE unsafeCreateUptoN' #-} -- | Create ByteString of up to size @l@ and use action @f@ to fill it's contents which returns its true size. createUptoN' :: Int -> (Ptr Word8 -> IO (Int, a)) -> IO (S.ByteString, a) createUptoN' l f = do fp <- S.mallocByteString l (l', res) <- withForeignPtr fp $ \p -> f p #if MIN_VERSION_bytestring(0,11,0) let bs = S.BS fp l' #else let bs = S.PS fp 0 l' #endif assert (l' <= l) $ return (bs, res) {-# INLINE createUptoN' #-} utf8-string-1.0.2/Data/ByteString/0000755000000000000000000000000007346545000015040 5ustar0000000000000000utf8-string-1.0.2/Data/ByteString/UTF8.hs0000644000000000000000000001721707346545000016132 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.ByteString.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- -- This module provides fast, validated encoding and decoding functions -- between 'ByteString's and 'String's. It does not exactly match the -- output of the Codec.Binary.UTF8.String output for invalid encodings -- as the number of replacement characters is sometimes longer. module Data.ByteString.UTF8 ( B.ByteString , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromChar , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Word import qualified Data.ByteString as B import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines) import Codec.Binary.UTF8.String(encode) import Codec.Binary.UTF8.Generic (buncons) -- | Converts a Haskell char into a UTF8 encoded bytestring. fromChar :: Char -> B.ByteString fromChar x = fromString [x] -- | Converts a Haskell string into a UTF8 encoded bytestring. fromString :: String -> B.ByteString fromString xs = B.pack (encode xs) -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with @\'\\0xFFFD\'@. toString :: B.ByteString -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character @\'\\0xFFFD\'@. -- XXX: Should we combine sequences of errors into a single replacement -- character? decode :: B.ByteString -> Maybe (Char,Int) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose :: Int -> B.ByteString -> (Char, Int) choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask :: Int -> Int -> Int mask c m = fromEnum (c .&. m) combine :: Int -> Word8 -> Int combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower :: Int -> Word8 -> Maybe Int follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower :: Int -> B.ByteString -> Maybe (Int, B.ByteString) get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 :: Int -> B.ByteString -> (Char, Int) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 :: Int -> B.ByteString -> (Char, Int) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 :: Int -> B.ByteString -> (Char, Int) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) -- | Split after a given number of characters. -- Negative values are treated as if they are 0. splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString) splitAt x bs = loop 0 x bs where loop a n _ | n <= 0 = B.splitAt a bs loop a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> (bs, B.empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: Int -> B.ByteString -> B.ByteString take n bs = fst (splitAt n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: Int -> B.ByteString -> B.ByteString drop n bs = snd (splitAt n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. span :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (B.drop n cs) _ -> B.splitAt a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. break :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by @\'\\0xFFFD\'@. uncons :: B.ByteString -> Maybe (Char,B.ByteString) uncons bs = do (c,n) <- decode bs return (c, B.drop n bs) -- | Traverse a bytestring (right biased). foldr :: (Char -> a -> a) -> a -> B.ByteString -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: (a -> Char -> a) -> a -> B.ByteString -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. length :: B.ByteString -> Int length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (B.drop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines''. lines :: B.ByteString -> [B.ByteString] lines bs | B.null bs = [] lines bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt x bs in xs : lines (B.tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: B.ByteString -> [B.ByteString] lines' bs | B.null bs = [] lines' bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt (x+1) bs in xs : lines' ys Nothing -> [bs] utf8-string-1.0.2/Data/String/0000755000000000000000000000000007346545000014214 5ustar0000000000000000utf8-string-1.0.2/Data/String/UTF8.hs0000644000000000000000000001435707346545000015310 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.String.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- {-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-} module Data.String.UTF8 ( -- * Representation UTF8 , UTF8Bytes() , fromString , toString , fromRep , toRep , G.replacement_char -- * Character based operations , uncons , splitAt , take , drop , span , break , foldl , foldr , length , lines , lines' -- * Representation based operations , null , decode , byteSplitAt , byteTake , byteDrop ) where import Prelude hiding (null,take,drop,span,break ,foldl,foldr,length,lines,splitAt) import qualified Codec.Binary.UTF8.Generic as G import Codec.Binary.UTF8.Generic (UTF8Bytes) import qualified Data.String as S -- | The type of strings that are represented using the UTF8 encoding. -- The parameter is the type of the container for the representation. newtype UTF8 string = Str string deriving (Eq,Ord) -- XXX: Is this OK? instance UTF8Bytes string index => Show (UTF8 string) where show x = show (toString x) instance UTF8Bytes string index => S.IsString (UTF8 string) where fromString = fromString fromRep :: string -> UTF8 string fromRep = Str toRep :: UTF8 string -> string toRep (Str x) = x -- | Converts a Haskell string into a UTF8 encoded string. -- Complexity: linear. fromString :: UTF8Bytes string index => String -> UTF8 string fromString xs = Str (G.fromString xs) -- | Convert a UTF8 encoded string into a Haskell string. -- Invalid characters are replaced by 'G.replacement_char'. -- Complexity: linear. toString :: UTF8Bytes string index => UTF8 string -> String toString (Str xs) = G.toString xs -- | Checks if there are no more bytes in the underlying representation. null :: UTF8Bytes string index => UTF8 string -> Bool null (Str x) = G.null x -- | Split after a given number of characters. -- Negative values are treated as if they are 0. splitAt :: UTF8Bytes string index => index -> UTF8 string -> (UTF8 string, UTF8 string) splitAt x (Str bs) = case G.splitAt x bs of (s1,s2) -> (Str s1, Str s2) -- | Split after a given number of bytes in the underlying representation. -- See also 'splitAt'. byteSplitAt :: UTF8Bytes string index => index -> UTF8 string -> (UTF8 string, UTF8 string) byteSplitAt n (Str x) = case G.bsplit n x of (as,bs) -> (Str as, Str bs) -- | Take only the given number of bytes from the underlying representation. -- See also 'take'. byteTake :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string byteTake n (Str x) = Str (fst (G.bsplit n x)) -- | Drop the given number of bytes from the underlying representation. -- See also 'drop'. byteDrop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string byteDrop n (Str x) = Str (G.bdrop n x) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string take n (Str bs) = Str (G.take n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string drop n (Str bs) = Str (G.drop n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate. span :: UTF8Bytes string index => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string) span p (Str bs) = case G.span p bs of (s1,s2) -> (Str s1, Str s2) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as 'G.replacement_char' to the predicate. break :: UTF8Bytes string index => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string) break p (Str bs) = case G.break p bs of (s1,s2) -> (Str s1, Str s2) -- | Get the first character of a byte string, if any. -- Invalid characters are replaced by 'G.replacement_char'. uncons :: UTF8Bytes string index => UTF8 string -> Maybe (Char, UTF8 string) uncons (Str x) = do (c,y) <- G.uncons x return (c, Str y) -- | Extract the first character for the underlying representation, -- if one is available. It also returns the number of bytes used -- in the representation of the character. -- See also 'uncons'. decode :: UTF8Bytes string index => UTF8 string -> Maybe (Char, index) decode (Str x) = G.decode x -- | Traverse a bytestring (right biased). foldr :: UTF8Bytes string index => (Char -> a -> a) -> a -> UTF8 string -> a foldr cons nil (Str cs) = G.foldr cons nil cs -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: UTF8Bytes string index => (a -> Char -> a) -> a -> UTF8 string -> a foldl add acc (Str cs) = G.foldl add acc cs -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. -- The function is linear in the number of bytes in the representation. length :: UTF8Bytes string index => UTF8 string -> index length (Str b) = G.length b -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines''. lines :: UTF8Bytes string index => UTF8 string -> [UTF8 string] lines (Str b) = map Str (G.lines b) -- XXX: unnecessary map -- | Split a string into a list of lines. -- Lines are terminated by @\'\\n\'@ or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: UTF8Bytes string index => UTF8 string -> [UTF8 string] lines' (Str x) = map Str (G.lines' x) -- XXX: unnecessary map utf8-string-1.0.2/LICENSE0000644000000000000000000000274107346545000013106 0ustar0000000000000000* Copyright (c) 2007, Galois Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Galois Inc. nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Galois Inc. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL Galois Inc. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. utf8-string-1.0.2/Setup.lhs0000644000000000000000000000011407346545000013701 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain utf8-string-1.0.2/tests/0000755000000000000000000000000007346545000013237 5ustar0000000000000000utf8-string-1.0.2/tests/Tests.hs0000644000000000000000000002213007346545000014673 0ustar0000000000000000import Codec.Binary.UTF8.String import Test.HUnit (Test (TestCase, TestList, TestLabel), assertEqual, errors, failures, runTestTT) import System.Exit (exitFailure) import Control.Monad (when) main :: IO () main = do counts <- runTestTT tests when (errors counts > 0 || failures counts > 0) exitFailure tests :: Test tests = TestList [test_1, test_2, test_3, test_4, test_5, test_6] test_1 :: Test test_1 = TestLabel "1 Some correct UTF-8 text" $ TestCase $ assertEqual "kosme, " "\x03ba\x1f79\x03c3\x03bc\x03b5 " (decode [0xce,0xba,0xe1,0xbd,0xb9,0xcf,0x83,0xce,0xbc,0xce,0xb5,0x20]) test_2 :: Test test_2 = TestLabel "2 Boundary condition test cases" $ TestList [test_2_1, test_2_2, test_2_3] test_2_1 :: Test test_2_1 = TestLabel "2.1 First possible sequence of a certain length" $ TestList $ map TestCase $ [ assertEqual "2.1.1, " "\0\0" (decode [0, 0]) , assertEqual "2.1.2, " "\x80\0" (decode [0xc2, 0x80, 0]) , assertEqual "2.1.3, " "\x800\0" (decode [0xe0, 0xa0, 0x80, 0]) , assertEqual "2.1.4, " "\x10000\0" (decode [0xf0, 0x90, 0x80, 0x80, 0]) , assertEqual "2.1.5, " "\xfffd\0" (decode [0xf8, 0x88, 0x80, 0x80, 0x80, 0]) , assertEqual "2.1.6, " "\xfffd\0" (decode [0xfc,0x84,0x80,0x80,0x80,0x80,0]) ] test_2_2 :: Test test_2_2 = TestLabel "2.2 Last possible sequence of a certain length" $ TestList $ map TestCase $ [ assertEqual "2.2.1, " "\x7f\0" (decode [0x7f, 0]) , assertEqual "2.2.2, " "\x7ff\0" (decode [0xdf, 0xbf, 0]) , assertEqual "2.2.3, " "\xfffd\0" (decode [0xef, 0xbf, 0xbf, 0]) , assertEqual "2.2.4, " "\xfffd\0" (decode [0xf7, 0xbf, 0xbf, 0xbf, 0]) , assertEqual "2.2.5, " "\xfffd\0" (decode [0xfb, 0xbf, 0xbf, 0xbf, 0xbf, 0]) , assertEqual "2.2.6, " "\xfffd\0" (decode [0xfd,0xbf,0xbf,0xbf,0xbf,0xbf,0]) ] test_2_3 :: Test test_2_3 = TestLabel "2.3 Other boundary conditions" $ TestList $ map TestCase $ [ assertEqual "2.3.1, " "\xd7ff\0" (decode [0xed, 0x9f, 0xbf, 0]) , assertEqual "2.3.2, " "\xe000\0" (decode [0xee, 0x80, 0x80, 0]) , assertEqual "2.3.3, " "\xfffd\0" (decode [0xef, 0xbf, 0xbd, 0]) , assertEqual "2.3.4, " "\x10ffff\0" (decode [0xf4, 0x8f, 0xbf, 0xbf, 0]) , assertEqual "2.3.5, " "\xfffd\0" (decode [0xf4, 0x90, 0x80, 0x80, 0]) ] test_3 :: Test test_3 = TestLabel "3 Malformed sequences" $ TestList [test_3_1, test_3_2, test_3_3, test_3_4, test_3_5] test_3_1 :: Test test_3_1 = TestLabel "3.1 Unexpected continuation bytes" $ TestList $ map TestCase $ [ assertEqual "3.1.1, " "\xfffd\0" (decode [0x80, 0]) , assertEqual "3.1.2, " "\xfffd\0" (decode [0xbf, 0]) , assertEqual "3.1.3, " "\xfffd\xfffd\0" (decode [0x80, 0xbf, 0]) , assertEqual "3.1.4, " "\xfffd\xfffd\xfffd\0" (decode [0x80, 0xbf, 0x80, 0]) , assertEqual "3.1.5, " "\xfffd\xfffd\xfffd\xfffd\0" (decode [0x80, 0xbf, 0x80, 0xbf, 0]) , assertEqual "3.1.6, " "\xfffd\xfffd\xfffd\xfffd\xfffd\0" (decode [0x80, 0xbf, 0x80, 0xbf, 0x80, 0]) , assertEqual "3.1.7, " "\xfffd\xfffd\xfffd\xfffd\xfffd\xfffd\0" (decode [0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0]) , assertEqual "3.1.8, " "\xfffd\xfffd\xfffd\xfffd\xfffd\xfffd\xfffd\0" (decode [0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0x80, 0]) , assertEqual "3.1.9, " (replicate 64 '\xfffd') (decode [0x80..0xbf]) ] test_3_2 :: Test test_3_2 = TestLabel "3.2 Lonely start characters" $ TestList $ map TestCase $ [ assertEqual "3.2.1, " (concat (replicate 32 "\xfffd ")) (decode (concat [[x,0x20] | x <- [0xc0..0xdf]])) , assertEqual "3.2.2, " (concat (replicate 16 "\xfffd ")) (decode (concat [[x,0x20] | x <- [0xe0..0xef]])) , assertEqual "3.2.3, " (concat (replicate 8 "\xfffd ")) (decode (concat [[x,0x20] | x <- [0xf0..0xf7]])) , assertEqual "3.2.4, " "\xfffd \xfffd \xfffd \xfffd " (decode (concat [[x,0x20] | x <- [0xf8..0xfb]])) , assertEqual "3.2.5, " "\xfffd \xfffd " (decode [0xfc, 0x20, 0xfd, 0x20]) ] test_3_3 :: Test test_3_3 = TestLabel "3.3 Sequences with last continuation byte missing" $ TestList $ map TestCase $ [ assertEqual "3.3.1, " "\xfffd " (decode [0xc0, 0x20]) , assertEqual "3.3.2, " "\xfffd " (decode [0xe0, 0x80, 0x20]) , assertEqual "3.3.3, " "\xfffd " (decode [0xf0, 0x80, 0x80, 0x20]) , assertEqual "3.3.4, " "\xfffd " (decode [0xf8, 0x80, 0x80, 0x80, 0x20]) , assertEqual "3.3.5, " "\xfffd " (decode [0xfc, 0x80, 0x80, 0x80,0x80,0x20]) , assertEqual "3.3.6, " "\xfffd " (decode [0xdf, 0x20]) , assertEqual "3.3.7, " "\xfffd " (decode [0xef, 0xbf, 0x20]) , assertEqual "3.3.8, " "\xfffd " (decode [0xf7, 0xbf, 0xbf, 0x20]) , assertEqual "3.3.9, " "\xfffd " (decode [0xfb, 0xbf, 0xbf, 0xbf, 0x20]) , assertEqual "3.3.10, " "\xfffd " (decode [0xfd, 0xbf, 0xbf, 0xbf,0xbf,0x20]) ] test_3_4 :: Test test_3_4 = TestLabel "3.4 Concatenation of incomplete sequences" $ TestCase $ assertEqual "3.4, " (replicate 10 '\xfffd') (decode [0xc0, 0xe0, 0x80, 0xf0, 0x80, 0x80, 0xf8, 0x80, 0x80, 0x80, 0xfc, 0x80, 0x80, 0x80,0x80, 0xdf, 0xef, 0xbf, 0xf7, 0xbf, 0xbf, 0xfb, 0xbf, 0xbf, 0xbf, 0xfd, 0xbf, 0xbf, 0xbf,0xbf]) test_3_5 :: Test test_3_5 = TestLabel "3.5 Impossible bytes" $ TestList $ map TestCase $ [ assertEqual "3.5.1, " "\xfffd " (decode [0xfe, 0x20]) , assertEqual "3.5.2, " "\xfffd " (decode [0xff, 0x20]) , assertEqual "3.5.3, " "\xfffd\xfffd\xfffd\xfffd " (decode [0xfe, 0xfe, 0xff, 0xff, 0x20]) ] test_4 :: Test test_4 = TestLabel "4 Overlong sequences" $ TestList [test_4_1, test_4_2, test_4_3] test_4_1 :: Test test_4_1 = TestLabel "4.1" $ TestList $ map TestCase $ [ assertEqual "4.1.1, " "\xfffd " (decode [0xc0, 0xaf, 0x20]) , assertEqual "4.1.2, " "\xfffd " (decode [0xe0, 0x80, 0xaf, 0x20]) , assertEqual "4.1.3, " "\xfffd " (decode [0xf0, 0x80, 0x80, 0xaf, 0x20]) , assertEqual "4.1.4, " "\xfffd " (decode [0xf8, 0x80, 0x80,0x80,0xaf, 0x20]) , assertEqual "4.1.5, " "\xfffd " (decode[0xfc,0x80,0x80,0x80,0x80,0xaf,0x20]) ] test_4_2 :: Test test_4_2 = TestLabel "4.2 Maximum overlong sequences" $ TestList $ map TestCase $ [ assertEqual "4.2.1, " "\xfffd " (decode [0xc1, 0xbf, 0x20]) , assertEqual "4.2.2, " "\xfffd " (decode [0xe0, 0x9f, 0xbf, 0x20]) , assertEqual "4.2.3, " "\xfffd " (decode [0xf0, 0x8f, 0xbf, 0xbf, 0x20]) , assertEqual "4.2.4, " "\xfffd " (decode [0xf8, 0x87, 0xbf, 0xbf,0xbf,0x20]) , assertEqual "4.2.5, " "\xfffd "(decode[0xfc,0x83,0xbf,0xbf,0xbf,0xbf,0x20]) ] test_4_3 :: Test test_4_3 = TestLabel "4.2 Overlong NUL" $ TestList $ map TestCase $ [ assertEqual "4.3.1, " "\xfffd " (decode [0xc0, 0x80, 0x20]) , assertEqual "4.3.2, " "\xfffd " (decode [0xe0, 0x80, 0x80, 0x20]) , assertEqual "4.3.3, " "\xfffd " (decode [0xf0, 0x80, 0x80, 0x80, 0x20]) , assertEqual "4.3.4, " "\xfffd " (decode [0xf8, 0x80, 0x80, 0x80,0x80,0x20]) , assertEqual "4.3.5, " "\xfffd "(decode[0xfc,0x80,0x80,0x80,0x80,0x80,0x20]) ] test_5 :: Test test_5 = TestLabel "Illegal code positions" $ TestList [test_5_1, test_5_2, test_5_3] test_5_1 :: Test test_5_1 = TestLabel "5.1 Single UTF-16 surrogates" $ TestList $ map TestCase $ [ assertEqual "5.1.1, " "\xfffd " (decode [0xed,0xa0,0x80,0x20]) , assertEqual "5.1.2, " "\xfffd " (decode [0xed,0xad,0xbf,0x20]) , assertEqual "5.1.3, " "\xfffd " (decode [0xed,0xae,0x80,0x20]) , assertEqual "5.1.4, " "\xfffd " (decode [0xed,0xaf,0xbf,0x20]) , assertEqual "5.1.5, " "\xfffd " (decode [0xed,0xb0,0x80,0x20]) , assertEqual "5.1.6, " "\xfffd " (decode [0xed,0xbe,0x80,0x20]) , assertEqual "5.1.7, " "\xfffd " (decode [0xed,0xbf,0xbf,0x20]) ] test_5_2 :: Test test_5_2 = TestLabel "5.2 Paired UTF-16 surrogates" $ TestList $ map TestCase $ [ assertEqual "5.2.1, " res (decode [0xed,0xa0,0x80,0xed,0xb0,0x80,0x20]) , assertEqual "5.2.2, " res (decode [0xed,0xa0,0x80,0xed,0xbf,0xbf,0x20]) , assertEqual "5.2.3, " res (decode [0xed,0xad,0xbf,0xed,0xb0,0x80,0x20]) , assertEqual "5.2.4, " res (decode [0xed,0xad,0xbf,0xed,0xbf,0xbf,0x20]) , assertEqual "5.2.5, " res (decode [0xed,0xae,0x80,0xed,0xb0,0x80,0x20]) , assertEqual "5.2.6, " res (decode [0xed,0xae,0x80,0xed,0xbf,0xbf,0x20]) , assertEqual "5.2.7, " res (decode [0xed,0xaf,0xbf,0xed,0xb0,0x80,0x20]) , assertEqual "5.2.8, " res (decode [0xed,0xaf,0xbf,0xed,0xbf,0xbf,0x20]) ] where res = "\xfffd\xfffd " test_5_3 :: Test test_5_3 = TestLabel "5.3 Other illegal code positions" $ TestList $ map TestCase $ [ assertEqual "5.3.1, " "\xfffd " (decode [0xef, 0xbf, 0xbe, 0x20]) , assertEqual "5.3.2, " "\xfffd " (decode [0xef, 0xbf, 0xbf, 0x20]) ] test_6 :: Test test_6 = TestLabel "Encode then decode" $ TestList $ map TestCase $ [ assertEqual "6.1" encodeDecodeTest [] ] -- -- test decode . encode == id for the class of chars we know that to be true of -- encodeDecodeTest :: [Char] encodeDecodeTest = filter (\x -> [x] /= decode (encode [x])) legal_codepoints ++ filter (\x -> ['\xfffd'] /= decode (encode [x])) illegal_codepoints where legal_codepoints = ['\0'..'\xd7ff'] ++ ['\xe000'..'\xfffd'] ++ ['\x10000'..'\x10ffff'] illegal_codepoints = '\xffff' : '\xfffe' : ['\xd800'..'\xdfff'] utf8-string-1.0.2/utf8-string.cabal0000644000000000000000000000300207346545000015246 0ustar0000000000000000Name: utf8-string Version: 1.0.2 Author: Eric Mertens Maintainer: emertens@galois.com License: BSD3 License-file: LICENSE Homepage: https://github.com/glguy/utf8-string/ Bug-Reports: https://github.com/glguy/utf8-string/issues Synopsis: Support for reading and writing UTF8 Strings Description: A UTF8 layer for Strings. The utf8-string package provides operations for encoding UTF8 strings to Word8 lists and back, and for reading and writing UTF8 without truncation. Category: Codec Build-type: Simple cabal-version: >= 1.10 Extra-Source-Files: CHANGELOG.markdown Tested-With: GHC==7.0.4, GHC==7.4.2, GHC==7.6.3, GHC==7.8.4, GHC==7.10.3, GHC==8.0.2, GHC==8.2.1 source-repository head type: git location: https://github.com/glguy/utf8-string library Ghc-options: -W -O2 build-depends: base >= 4.3 && < 5, bytestring >= 0.9 Exposed-modules: Codec.Binary.UTF8.String Codec.Binary.UTF8.Generic Data.String.UTF8 Data.ByteString.UTF8 Data.ByteString.Lazy.UTF8 default-language: Haskell2010 test-suite unit-tests type: exitcode-stdio-1.0 hs-source-dirs: tests main-is: Tests.hs build-depends: base, HUnit >= 1.3 && < 1.7, utf8-string default-language: Haskell2010