utf8-string-1.0.1.1/0000755000000000000000000000000012566371234012241 5ustar0000000000000000utf8-string-1.0.1.1/CHANGELOG.markdown0000644000000000000000000000045512566371234015300 0ustar00000000000000001.0.1.1 ----- * Build correctly on GHC-7.0 (#14) 1.0.1 ----- * Improve the performance of Data.ByteString.Lazy.UTF8.fromString. (Thanks, ndmitchell) 1 ----- * Remove out all the old utf8 IO support. GHC supports utf8 now. 0.3.8 ----- * Performance tweaks * bytestring-in-base flag default to False utf8-string-1.0.1.1/LICENSE0000644000000000000000000000274112566371234013252 0ustar0000000000000000* Copyright (c) 2007, Galois Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Galois Inc. nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY Galois Inc. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL Galois Inc. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. utf8-string-1.0.1.1/Setup.lhs0000644000000000000000000000011412566371234014045 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain utf8-string-1.0.1.1/utf8-string.cabal0000644000000000000000000000212512566371234015417 0ustar0000000000000000Name: utf8-string Version: 1.0.1.1 Author: Eric Mertens Maintainer: emertens@galois.com License: BSD3 License-file: LICENSE Homepage: http://github.com/glguy/utf8-string/ Synopsis: Support for reading and writing UTF8 Strings Description: A UTF8 layer for Strings. The utf8-string package provides operations for encoding UTF8 strings to Word8 lists and back, and for reading and writing UTF8 without truncation. Category: Codec Build-type: Simple cabal-version: >= 1.2 Extra-Source-Files: CHANGELOG.markdown Tested-With: GHC==7.0.4, GHC==7.4.2, GHC==7.6.3, GHC==7.8.4, GHC==7.10.2 library Ghc-options: -W -O2 build-depends: base >= 4.3 && < 4.9, bytestring >= 0.9 Extensions: CPP Exposed-modules: Codec.Binary.UTF8.String Codec.Binary.UTF8.Generic Data.String.UTF8 Data.ByteString.UTF8 Data.ByteString.Lazy.UTF8 utf8-string-1.0.1.1/Codec/0000755000000000000000000000000012566371234013256 5ustar0000000000000000utf8-string-1.0.1.1/Codec/Binary/0000755000000000000000000000000012566371234014502 5ustar0000000000000000utf8-string-1.0.1.1/Codec/Binary/UTF8/0000755000000000000000000000000012566371234015230 5ustar0000000000000000utf8-string-1.0.1.1/Codec/Binary/UTF8/Generic.hs0000644000000000000000000002510012566371234017136 0ustar0000000000000000{-# LANGUAGE CPP, MultiParamTypeClasses, FunctionalDependencies, FlexibleInstances #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Codec.Binary.UTF8.Generic -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- module Codec.Binary.UTF8.Generic ( UTF8Bytes(..) , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Int import Data.Word import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as L import qualified Data.List as List import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines,null,tail) import Codec.Binary.UTF8.String(encode) #ifdef BYTESTRING_IN_BASE import Data.ByteString.Base (unsafeHead, unsafeTail) #endif class (Num s, Ord s) => UTF8Bytes b s | b -> s where bsplit :: s -> b -> (b,b) bdrop :: s -> b -> b buncons :: b -> Maybe (Word8,b) elemIndex :: Word8 -> b -> Maybe s empty :: b null :: b -> Bool pack :: [Word8] -> b tail :: b -> b instance UTF8Bytes B.ByteString Int where bsplit = B.splitAt bdrop = B.drop buncons = unconsB elemIndex = B.elemIndex empty = B.empty null = B.null pack = B.pack tail = B.tail instance UTF8Bytes L.ByteString Int64 where bsplit = L.splitAt bdrop = L.drop buncons = unconsL elemIndex = L.elemIndex empty = L.empty null = L.null pack = L.pack tail = L.tail instance UTF8Bytes [Word8] Int where bsplit = List.splitAt bdrop = List.drop buncons (x:xs) = Just (x,xs) buncons [] = Nothing elemIndex x xs = List.elemIndex (toEnum (fromEnum x)) xs empty = [] null = List.null pack = id tail = List.tail -- | Converts a Haskell string into a UTF8 encoded bytestring. {-# SPECIALIZE fromString :: String -> B.ByteString #-} {-# SPECIALIZE fromString :: String -> L.ByteString #-} {-# SPECIALIZE fromString :: String -> [Word8] #-} fromString :: UTF8Bytes b s => String -> b fromString xs = pack (encode xs) -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with '\xFFFD'. {-# SPECIALIZE toString :: B.ByteString -> String #-} {-# SPECIALIZE toString :: L.ByteString -> String #-} {-# SPECIALIZE toString :: [Word8] -> String #-} toString :: UTF8Bytes b s => b -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character '\0xFFFD'. -- XXX: Should we combine sequences of errors into a single replacement -- character? {-# SPECIALIZE decode :: B.ByteString -> Maybe (Char,Int) #-} {-# SPECIALIZE decode :: L.ByteString -> Maybe (Char,Int64) #-} {-# SPECIALIZE decode :: [Word8] -> Maybe (Char,Int) #-} decode :: UTF8Bytes b s => b -> Maybe (Char,s) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask c m = fromEnum (c .&. m) combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) -- | Split after a given number of characters. -- Negative values are treated as if they are 0. {-# SPECIALIZE splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString) #-} {-# SPECIALIZE splitAt :: Int64 -> L.ByteString -> (L.ByteString,L.ByteString) #-} {-# SPECIALIZE splitAt :: Int -> [Word8] -> ([Word8],[Word8]) #-} splitAt :: UTF8Bytes b s => s -> b -> (b,b) splitAt x bs = loop 0 x bs where loop a n _ | n <= 0 = bsplit a bs loop a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (bdrop y bs1) Nothing -> (bs, empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. {-# INLINE take #-} take :: UTF8Bytes b s => s -> b -> b take n bs = fst (splitAt n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. {-# INLINE drop #-} drop :: UTF8Bytes b s => s -> b -> b drop n bs = snd (splitAt n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. {-# SPECIALIZE span :: (Char -> Bool) -> B.ByteString -> (B.ByteString,B.ByteString) #-} {-# SPECIALIZE span :: (Char -> Bool) -> L.ByteString -> (L.ByteString,L.ByteString) #-} {-# SPECIALIZE span :: (Char -> Bool) -> [Word8] -> ([Word8],[Word8]) #-} span :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (bdrop n cs) _ -> bsplit a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. {-# INLINE break #-} break :: UTF8Bytes b s => (Char -> Bool) -> b -> (b,b) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by '\0xFFFD'. {-# INLINE uncons #-} uncons :: UTF8Bytes b s => b -> Maybe (Char,b) uncons bs = do (c,n) <- decode bs return (c, bdrop n bs) -- | Traverse a bytestring (right biased). {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> B.ByteString -> a #-} {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> L.ByteString -> a #-} {-# SPECIALIZE foldr :: (Char -> a -> a) -> a -> [Word8] -> a #-} foldr :: UTF8Bytes b s => (Char -> a -> a) -> a -> b -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> B.ByteString -> a #-} {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> L.ByteString -> a #-} {-# SPECIALIZE foldl :: (a -> Char -> a) -> a -> [Word8] -> a #-} foldl :: UTF8Bytes b s => (a -> Char -> a) -> a -> b -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. {-# SPECIALIZE length :: B.ByteString -> Int #-} {-# SPECIALIZE length :: L.ByteString -> Int64 #-} {-# SPECIALIZE length :: [Word8] -> Int #-} length :: UTF8Bytes b s => b -> s length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (bdrop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines\''. {-# SPECIALIZE lines :: B.ByteString -> [B.ByteString] #-} {-# SPECIALIZE lines :: L.ByteString -> [L.ByteString] #-} {-# SPECIALIZE lines :: [Word8] -> [[Word8]] #-} lines :: UTF8Bytes b s => b -> [b] lines bs | null bs = [] lines bs = case elemIndex 10 bs of Just x -> let (xs,ys) = bsplit x bs in xs : lines (tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. {-# SPECIALIZE lines' :: B.ByteString -> [B.ByteString] #-} {-# SPECIALIZE lines' :: L.ByteString -> [L.ByteString] #-} {-# SPECIALIZE lines' :: [Word8] -> [[Word8]] #-} lines' :: UTF8Bytes b s => b -> [b] lines' bs | null bs = [] lines' bs = case elemIndex 10 bs of Just x -> let (xs,ys) = bsplit (x+1) bs in xs : lines' ys Nothing -> [bs] ----------- -- Compatibility functions for base-2 unconsB :: B.ByteString -> Maybe (Word8,B.ByteString) unconsL :: L.ByteString -> Maybe (Word8,L.ByteString) #ifdef BYTESTRING_IN_BASE unconsB bs | B.null bs = Nothing | otherwise = Just (unsafeHead bs, unsafeTail bs) unconsL bs = case L.toChunks bs of (x:xs) | not (B.null x) -> Just (unsafeHead x, L.fromChunks (unsafeTail x:xs)) _ -> Nothing #else unconsB = B.uncons unconsL = L.uncons #endif utf8-string-1.0.1.1/Codec/Binary/UTF8/String.hs0000644000000000000000000001200112566371234017024 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Codec.Binary.UTF8.String -- Copyright : (c) Eric Mertens 2007 -- License : BSD3-style (see LICENSE) -- -- Maintainer: emertens@galois.com -- Stability : experimental -- Portability : portable -- -- Support for encoding UTF8 Strings to and from @[Word8]@ -- module Codec.Binary.UTF8.String ( encode , decode , encodeString , decodeString , encodeChar , isUTF8Encoded , utf8Encode ) where import Data.Word (Word8,Word32) import Data.Bits ((.|.),(.&.),shiftL,shiftR) import Data.Char (chr,ord) default(Int) -- | Encode a string using 'encode' and store the result in a 'String'. encodeString :: String -> String encodeString xs = map (toEnum . fromEnum) (encode xs) -- | Decode a string using 'decode' using a 'String' as input. -- | This is not safe but it is necessary if UTF-8 encoded text -- | has been loaded into a 'String' prior to being decoded. decodeString :: String -> String decodeString xs = decode (map (toEnum . fromEnum) xs) replacement_character :: Char replacement_character = '\xfffd' -- | Encode a single Haskell Char to a list of Word8 values, in UTF8 format. encodeChar :: Char -> [Word8] encodeChar = map fromIntegral . go . ord where go oc | oc <= 0x7f = [oc] | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) , 0x80 + oc .&. 0x3f ] | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) , 0x80 + ((oc `shiftR` 6) .&. 0x3f) , 0x80 + oc .&. 0x3f ] | otherwise = [ 0xf0 + (oc `shiftR` 18) , 0x80 + ((oc `shiftR` 12) .&. 0x3f) , 0x80 + ((oc `shiftR` 6) .&. 0x3f) , 0x80 + oc .&. 0x3f ] -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. encode :: String -> [Word8] encode = concatMap encodeChar -- -- | Decode a UTF8 string packed into a list of Word8 values, directly to String -- decode :: [Word8] -> String decode [ ] = "" decode (c:cs) | c < 0x80 = chr (fromEnum c) : decode cs | c < 0xc0 = replacement_character : decode cs | c < 0xe0 = multi1 | c < 0xf0 = multi_byte 2 0xf 0x800 | c < 0xf8 = multi_byte 3 0x7 0x10000 | c < 0xfc = multi_byte 4 0x3 0x200000 | c < 0xfe = multi_byte 5 0x1 0x4000000 | otherwise = replacement_character : decode cs where multi1 = case cs of c1 : ds | c1 .&. 0xc0 == 0x80 -> let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) in if d >= 0x000080 then toEnum d : decode ds else replacement_character : decode ds _ -> replacement_character : decode cs multi_byte :: Int -> Word8 -> Int -> [Char] multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) where aux 0 rs acc | overlong <= acc && acc <= 0x10ffff && (acc < 0xd800 || 0xdfff < acc) && (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs | otherwise = replacement_character : decode rs aux n (r:rs) acc | r .&. 0xc0 == 0x80 = aux (n-1) rs $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) aux _ rs _ = replacement_character : decode rs -- | @utf8Encode str@ is a convenience function; checks to see if -- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but -- you are better off keeping track of the encoding so as to avoid -- the cost of checking. utf8Encode :: String -> String utf8Encode str | isUTF8Encoded str = str | otherwise = encodeString str -- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form. isUTF8Encoded :: String -> Bool isUTF8Encoded [] = True isUTF8Encoded (x:xs) = case ox of _ | ox < 0x80 -> isUTF8Encoded xs | ox > 0xff -> False | ox < 0xc0 -> False | ox < 0xe0 -> check1 | ox < 0xf0 -> check_byte 2 0xf 0 | ox < 0xf8 -> check_byte 3 0x7 0x10000 | ox < 0xfc -> check_byte 4 0x3 0x200000 | ox < 0xfe -> check_byte 5 0x1 0x4000000 | otherwise -> False where ox = toW32 x toW32 :: Char -> Word32 toW32 ch = fromIntegral (fromEnum ch) check1 = case xs of [] -> False c1 : ds | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False | otherwise -> isUTF8Encoded ds where oc = toW32 c1 d = ((ox .&. 0x1f) `shiftL` 6) .|. (oc .&. 0x3f) check_byte :: Int -> Word32 -> Word32 -> Bool check_byte i mask overlong = aux i xs (ox .&. mask) where aux 0 rs acc | overlong <= acc && acc <= 0x10ffff && (acc < 0xd800 || 0xdfff < acc) && (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs | otherwise = False aux n (r:rs) acc | toW32 r .&. 0xc0 == 0x80 = aux (n-1) rs (acc `shiftL` 6 .|. (toW32 r .&. 0x3f)) aux _ _ _ = False utf8-string-1.0.1.1/Data/0000755000000000000000000000000012566371234013112 5ustar0000000000000000utf8-string-1.0.1.1/Data/ByteString/0000755000000000000000000000000012566371234015204 5ustar0000000000000000utf8-string-1.0.1.1/Data/ByteString/UTF8.hs0000644000000000000000000001674412566371234016302 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.ByteString.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- -- This module provides fast, validated encoding and decoding functions -- between 'ByteString's and 'String's. It does not exactly match the -- output of the Codec.Binary.UTF8.String output for invalid encodings -- as the number of replacement characters is sometimes longer. module Data.ByteString.UTF8 ( B.ByteString , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Word import qualified Data.ByteString as B import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines) import Codec.Binary.UTF8.String(encode) import Codec.Binary.UTF8.Generic (buncons) -- | Converts a Haskell string into a UTF8 encoded bytestring. fromString :: String -> B.ByteString fromString xs = B.pack (encode xs) -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with '\xFFFD'. toString :: B.ByteString -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character '\0xFFFD'. -- XXX: Should we combine sequences of errors into a single replacement -- character? decode :: B.ByteString -> Maybe (Char,Int) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose :: Int -> B.ByteString -> (Char, Int) choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask :: Int -> Int -> Int mask c m = fromEnum (c .&. m) combine :: Int -> Word8 -> Int combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower :: Int -> Word8 -> Maybe Int follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower :: Int -> B.ByteString -> Maybe (Int, B.ByteString) get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 :: Int -> B.ByteString -> (Char, Int) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 :: Int -> B.ByteString -> (Char, Int) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 :: Int -> B.ByteString -> (Char, Int) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) -- | Split after a given number of characters. -- Negative values are treated as if they are 0. splitAt :: Int -> B.ByteString -> (B.ByteString,B.ByteString) splitAt x bs = loop 0 x bs where loop a n _ | n <= 0 = B.splitAt a bs loop a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> (bs, B.empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: Int -> B.ByteString -> B.ByteString take n bs = fst (splitAt n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: Int -> B.ByteString -> B.ByteString drop n bs = snd (splitAt n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. span :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (B.drop n cs) _ -> B.splitAt a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. break :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by '\0xFFFD'. uncons :: B.ByteString -> Maybe (Char,B.ByteString) uncons bs = do (c,n) <- decode bs return (c, B.drop n bs) -- | Traverse a bytestring (right biased). foldr :: (Char -> a -> a) -> a -> B.ByteString -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: (a -> Char -> a) -> a -> B.ByteString -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. length :: B.ByteString -> Int length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (B.drop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines\''. lines :: B.ByteString -> [B.ByteString] lines bs | B.null bs = [] lines bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt x bs in xs : lines (B.tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: B.ByteString -> [B.ByteString] lines' bs | B.null bs = [] lines' bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt (x+1) bs in xs : lines' ys Nothing -> [bs] utf8-string-1.0.1.1/Data/ByteString/Lazy/0000755000000000000000000000000012566371234016123 5ustar0000000000000000utf8-string-1.0.1.1/Data/ByteString/Lazy/UTF8.hs0000644000000000000000000002557512566371234017223 0ustar0000000000000000{-# LANGUAGE CPP, BangPatterns #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.ByteString.Lazy.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- -- This module provides fast, validated encoding and decoding functions -- between 'ByteString's and 'String's. It does not exactly match the -- output of the Codec.Binary.UTF8.String output for invalid encodings -- as the number of replacement characters is sometimes longer. module Data.ByteString.Lazy.UTF8 ( B.ByteString , decode , replacement_char , uncons , splitAt , take , drop , span , break , fromString , toString , foldl , foldr , length , lines , lines' ) where import Data.Bits import Data.Word import Data.Int import Foreign.Storable import Foreign.Ptr import Foreign.ForeignPtr import Data.Char (ord) import Control.Exception (assert) import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B import qualified Data.ByteString.Internal as S import Prelude hiding (take,drop,splitAt,span,break,foldr,foldl,length,lines) import Codec.Binary.UTF8.Generic (buncons) #if MIN_VERSION_base(4,4,0) import System.IO.Unsafe (unsafeDupablePerformIO) #else import GHC.IO (unsafeDupablePerformIO) #endif --------------------------------------------------------------------- -- ENCODING -- | Converts a Haskell string into a UTF8 encoded bytestring. fromString :: String -> B.ByteString fromString [] = B.empty fromString xs0 = packChunks 32 xs0 where packChunks n xs = case packUptoLenBytes n xs of (bs, [] ) -> B.chunk bs B.Empty (bs, xs') -> B.Chunk bs (packChunks (min (n * 2) B.smallChunkSize) xs') packUptoLenBytes :: Int -> String -> (S.ByteString, String) packUptoLenBytes len xs = unsafeCreateUptoN' len $ \ptr -> do (end, xs') <- go ptr (ptr `plusPtr` (len-4)) xs return (end `minusPtr` ptr, xs') -- end is the last position at which you can write a whole 4 byte sequence safely go :: Ptr Word8 -> Ptr Word8 -> String -> IO (Ptr Word8, String) go !ptr !end xs | ptr > end = return (ptr, xs) go !ptr !_ [] = return (ptr, []) go !ptr !end (x:xs) | x <= '\x7f' = poke ptr (S.c2w x) >> go (plusPtr ptr 1) end xs | otherwise = case ord x of oc | oc <= 0x7ff -> do poke ptr $ fromIntegral $ 0xc0 + (oc `shiftR` 6) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 2) end xs | oc <= 0xffff -> do poke ptr $ fromIntegral $ 0xe0 + (oc `shiftR` 12) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + ((oc `shiftR` 6) .&. 0x3f) pokeElemOff ptr 2 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 3) end xs | otherwise -> do poke ptr $ fromIntegral $ 0xf0 + (oc `shiftR` 18) pokeElemOff ptr 1 $ fromIntegral $ 0x80 + ((oc `shiftR` 12) .&. 0x3f) pokeElemOff ptr 2 $ fromIntegral $ 0x80 + ((oc `shiftR` 6) .&. 0x3f) pokeElemOff ptr 3 $ fromIntegral $ 0x80 + oc .&. 0x3f go (plusPtr ptr 4) end xs --------------------------------------------------------------------- -- DECODING -- | Convert a UTF8 encoded bytestring into a Haskell string. -- Invalid characters are replaced with '\xFFFD'. toString :: B.ByteString -> String toString bs = foldr (:) [] bs -- | This character is used to mark errors in a UTF8 encoded string. replacement_char :: Char replacement_char = '\xfffd' -- | Try to extract a character from a byte string. -- Returns 'Nothing' if there are no more bytes in the byte string. -- Otherwise, it returns a decoded character and the number of -- bytes used in its representation. -- Errors are replaced by character '\0xFFFD'. -- XXX: Should we combine sequences of errors into a single replacement -- character? decode :: B.ByteString -> Maybe (Char,Int64) decode bs = do (c,cs) <- buncons bs return (choose (fromEnum c) cs) where choose :: Int -> B.ByteString -> (Char, Int64) choose c cs | c < 0x80 = (toEnum $ fromEnum c, 1) | c < 0xc0 = (replacement_char, 1) | c < 0xe0 = bytes2 (mask c 0x1f) cs | c < 0xf0 = bytes3 (mask c 0x0f) cs | c < 0xf8 = bytes4 (mask c 0x07) cs | otherwise = (replacement_char, 1) mask :: Int -> Int -> Int mask c m = fromEnum (c .&. m) combine :: Int -> Word8 -> Int combine acc r = shiftL acc 6 .|. fromEnum (r .&. 0x3f) follower :: Int -> Word8 -> Maybe Int follower acc r | r .&. 0xc0 == 0x80 = Just (combine acc r) follower _ _ = Nothing {-# INLINE get_follower #-} get_follower :: Int -> B.ByteString -> Maybe (Int, B.ByteString) get_follower acc cs = do (x,xs) <- buncons cs acc1 <- follower acc x return (acc1,xs) bytes2 :: Int -> B.ByteString -> (Char, Int64) bytes2 c cs = case get_follower c cs of Just (d, _) | d >= 0x80 -> (toEnum d, 2) | otherwise -> (replacement_char, 1) _ -> (replacement_char, 1) bytes3 :: Int -> B.ByteString -> (Char, Int64) bytes3 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d, _) | (d >= 0x800 && d < 0xd800) || (d > 0xdfff && d < 0xfffe) -> (toEnum d, 3) | otherwise -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) bytes4 :: Int -> B.ByteString -> (Char, Int64) bytes4 c cs = case get_follower c cs of Just (d1, cs1) -> case get_follower d1 cs1 of Just (d2, cs2) -> case get_follower d2 cs2 of Just (d,_) | d >= 0x10000 && d < 0x110000 -> (toEnum d, 4) | otherwise -> (replacement_char, 4) _ -> (replacement_char, 3) _ -> (replacement_char, 2) _ -> (replacement_char, 1) {-# INLINE decode #-} -- | Split after a given number of characters. -- Negative values are treated as if they are 0. splitAt :: Int64 -> B.ByteString -> (B.ByteString,B.ByteString) splitAt x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.splitAt a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> (bs, B.empty) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: Int64 -> B.ByteString -> B.ByteString take x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.take a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> bs -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: Int64 -> B.ByteString -> B.ByteString drop x bs = loop 0 x bs where loop !a n _ | n <= 0 = B.drop a bs loop !a n bs1 = case decode bs1 of Just (_,y) -> loop (a+y) (n-1) (B.drop y bs1) Nothing -> B.empty -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. span :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) span p bs = loop 0 bs where loop a cs = case decode cs of Just (c,n) | p c -> loop (a+n) (B.drop n cs) _ -> B.splitAt a bs -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. break :: (Char -> Bool) -> B.ByteString -> (B.ByteString, B.ByteString) break p bs = span (not . p) bs -- | Get the first character of a byte string, if any. -- Malformed characters are replaced by '\0xFFFD'. uncons :: B.ByteString -> Maybe (Char,B.ByteString) uncons bs = do (c,n) <- decode bs return (c, B.drop n bs) -- | Traverse a bytestring (right biased). foldr :: (Char -> a -> a) -> a -> B.ByteString -> a foldr cons nil cs = case uncons cs of Just (a,as) -> cons a (foldr cons nil as) Nothing -> nil -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: (a -> Char -> a) -> a -> B.ByteString -> a foldl add acc cs = case uncons cs of Just (a,as) -> let v = add acc a in seq v (foldl add v as) Nothing -> acc -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. length :: B.ByteString -> Int length b = loop 0 b where loop n xs = case decode xs of Just (_,m) -> loop (n+1) (B.drop m xs) Nothing -> n -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines\''. lines :: B.ByteString -> [B.ByteString] lines bs | B.null bs = [] lines bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt x bs in xs : lines (B.tail ys) Nothing -> [bs] -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: B.ByteString -> [B.ByteString] lines' bs | B.null bs = [] lines' bs = case B.elemIndex 10 bs of Just x -> let (xs,ys) = B.splitAt (x+1) bs in xs : lines' ys Nothing -> [bs] --------------------------------------------------------------------- -- COPIED FROM BYTESTRING -- These functions are copied verbatum from Data.ByteString.Internal -- I suspect their lack of export is an oversight unsafeCreateUptoN' :: Int -> (Ptr Word8 -> IO (Int, a)) -> (S.ByteString, a) unsafeCreateUptoN' l f = unsafeDupablePerformIO (createUptoN' l f) {-# INLINE unsafeCreateUptoN' #-} -- | Create ByteString of up to size @l@ and use action @f@ to fill it's contents which returns its true size. createUptoN' :: Int -> (Ptr Word8 -> IO (Int, a)) -> IO (S.ByteString, a) createUptoN' l f = do fp <- S.mallocByteString l (l', res) <- withForeignPtr fp $ \p -> f p assert (l' <= l) $ return (S.PS fp 0 l', res) {-# INLINE createUptoN' #-} utf8-string-1.0.1.1/Data/String/0000755000000000000000000000000012566371234014360 5ustar0000000000000000utf8-string-1.0.1.1/Data/String/UTF8.hs0000644000000000000000000001420512566371234015444 0ustar0000000000000000{-# LANGUAGE CPP #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} #endif -- -- | -- Module : Data.String.UTF8 -- Copyright : (c) Iavor S. Diatchki 2009 -- License : BSD3-style (see LICENSE) -- -- Maintainer : emertens@galois.com -- Stability : experimental -- Portability : portable -- {-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-} module Data.String.UTF8 ( -- * Representation UTF8 , UTF8Bytes() , fromString , toString , fromRep , toRep , G.replacement_char -- * Character based operations , uncons , splitAt , take , drop , span , break , foldl , foldr , length , lines , lines' -- * Representation based operations , null , decode , byteSplitAt , byteTake , byteDrop ) where import Prelude hiding (null,take,drop,span,break ,foldl,foldr,length,lines,splitAt) import qualified Codec.Binary.UTF8.Generic as G import Codec.Binary.UTF8.Generic (UTF8Bytes) -- | The type of strings that are represented using the UTF8 encoding. -- The parameter is the type of the container for the representation. newtype UTF8 string = Str string deriving (Eq,Ord) -- XXX: Is this OK? instance UTF8Bytes string index => Show (UTF8 string) where show x = show (toString x) fromRep :: string -> UTF8 string fromRep = Str toRep :: UTF8 string -> string toRep (Str x) = x -- | Converts a Haskell string into a UTF8 encoded string. -- Complexity: linear. fromString :: UTF8Bytes string index => String -> UTF8 string fromString xs = Str (G.fromString xs) -- | Convert a UTF8 encoded string into a Haskell string. -- Invalid characters are replaced by 'replacement_char'. -- Complexity: linear. toString :: UTF8Bytes string index => UTF8 string -> String toString (Str xs) = G.toString xs -- | Checks if there are no more bytes in the underlying representation. null :: UTF8Bytes string index => UTF8 string -> Bool null (Str x) = G.null x -- | Split after a given number of characters. -- Negative values are treated as if they are 0. -- See also 'bytesSplitAt'. splitAt :: UTF8Bytes string index => index -> UTF8 string -> (UTF8 string, UTF8 string) splitAt x (Str bs) = case G.splitAt x bs of (s1,s2) -> (Str s1, Str s2) -- | Split after a given number of bytes in the underlying representation. -- See also 'splitAt'. byteSplitAt :: UTF8Bytes string index => index -> UTF8 string -> (UTF8 string, UTF8 string) byteSplitAt n (Str x) = case G.bsplit n x of (as,bs) -> (Str as, Str bs) -- | Take only the given number of bytes from the underlying representation. -- See also 'take'. byteTake :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string byteTake n (Str x) = Str (fst (G.bsplit n x)) -- | Drop the given number of bytes from the underlying representation. -- See also 'drop'. byteDrop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string byteDrop n (Str x) = Str (G.bdrop n x) -- | @take n s@ returns the first @n@ characters of @s@. -- If @s@ has less than @n@ characters, then we return the whole of @s@. take :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string take n (Str bs) = Str (G.take n bs) -- | @drop n s@ returns the @s@ without its first @n@ characters. -- If @s@ has less than @n@ characters, then we return an empty string. drop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string drop n (Str bs) = Str (G.drop n bs) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as '\0xFFFD' to the predicate. span :: UTF8Bytes string index => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string) span p (Str bs) = case G.span p bs of (s1,s2) -> (Str s1, Str s2) -- | Split a string into two parts: the first is the longest prefix -- that contains only characters that do not satisfy the predicate; the second -- part is the rest of the string. -- Invalid characters are passed as 'replacement_char' to the predicate. break :: UTF8Bytes string index => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string) break p (Str bs) = case G.break p bs of (s1,s2) -> (Str s1, Str s2) -- | Get the first character of a byte string, if any. -- Invalid characters are replaced by 'replacement_char'. uncons :: UTF8Bytes string index => UTF8 string -> Maybe (Char, UTF8 string) uncons (Str x) = do (c,y) <- G.uncons x return (c, Str y) -- | Extract the first character for the underlying representation, -- if one is available. It also returns the number of bytes used -- in the representation of the character. -- See also 'uncons', 'dropBytes'. decode :: UTF8Bytes string index => UTF8 string -> Maybe (Char, index) decode (Str x) = G.decode x -- | Traverse a bytestring (right biased). foldr :: UTF8Bytes string index => (Char -> a -> a) -> a -> UTF8 string -> a foldr cons nil (Str cs) = G.foldr cons nil cs -- | Traverse a bytestring (left biased). -- This function is strict in the accumulator. foldl :: UTF8Bytes string index => (a -> Char -> a) -> a -> UTF8 string -> a foldl add acc (Str cs) = G.foldl add acc cs -- | Counts the number of characters encoded in the bytestring. -- Note that this includes replacement characters. -- The function is linear in the number of bytes in the representation. length :: UTF8Bytes string index => UTF8 string -> index length (Str b) = G.length b -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- See also 'lines\''. lines :: UTF8Bytes string index => UTF8 string -> [UTF8 string] lines (Str b) = map Str (G.lines b) -- XXX: unnecessary map -- | Split a string into a list of lines. -- Lines are terminated by '\n' or the end of the string. -- Empty lines may not be terminated by the end of the string. -- This function preserves the terminators. -- See also 'lines'. lines' :: UTF8Bytes string index => UTF8 string -> [UTF8 string] lines' (Str x) = map Str (G.lines' x) -- XXX: unnecessary map