{-# LANGUAGE DeriveDataTypeable #-}
module Data.Encoding.UTF8 where
import Control.Throws
import Data.Char
import Data.Bits
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Typeable
data UTF8 = UTF8
| UTF8Strict
deriving (UTF8 -> UTF8 -> Bool
(UTF8 -> UTF8 -> Bool) -> (UTF8 -> UTF8 -> Bool) -> Eq UTF8
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: UTF8 -> UTF8 -> Bool
== :: UTF8 -> UTF8 -> Bool
$c/= :: UTF8 -> UTF8 -> Bool
/= :: UTF8 -> UTF8 -> Bool
Eq,Int -> UTF8 -> ShowS
[UTF8] -> ShowS
UTF8 -> String
(Int -> UTF8 -> ShowS)
-> (UTF8 -> String) -> ([UTF8] -> ShowS) -> Show UTF8
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> UTF8 -> ShowS
showsPrec :: Int -> UTF8 -> ShowS
$cshow :: UTF8 -> String
show :: UTF8 -> String
$cshowList :: [UTF8] -> ShowS
showList :: [UTF8] -> ShowS
Show,Typeable)
instance Encoding UTF8 where
encodeChar :: forall (m :: * -> *). ByteSink m => UTF8 -> Char -> m ()
encodeChar UTF8
_ Char
c
| Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000007F = Int -> m ()
p8 Int
n
| Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x000007FF = do
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xC0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
| Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000FFFF = do
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xE0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
12)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
| Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0010FFFF = do
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0xF0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
18)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Int
n Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
Int -> m ()
p8 (Int -> m ()) -> Int -> m ()
forall a b. (a -> b) -> a -> b
$ Int
0x80 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Int
n Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3F)
| Bool
otherwise = EncodingException -> m ()
forall a. EncodingException -> m a
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Char -> EncodingException
HasNoRepresentation Char
c)
where
n :: Int
n = Char -> Int
ord Char
c
p8 :: Int -> m ()
p8 = Word8 -> m ()
forall (m :: * -> *). ByteSink m => Word8 -> m ()
pushWord8(Word8 -> m ()) -> (Int -> Word8) -> Int -> m ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral
encodeable :: UTF8 -> Char -> Bool
encodeable UTF8
_ Char
c = Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
'\x10FFFF'
decodeChar :: forall (m :: * -> *). ByteSource m => UTF8 -> m Char
decodeChar UTF8
UTF8 = do
w1 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
case () of
()
_
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> Char -> m Char
forall a. a -> m a
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> DecodingException -> m Char
forall a. DecodingException -> m a
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
return $ chr $
((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
w3 <- fetchWord8
let v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x0F
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
w2 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
let v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x07
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v4 = Word8
w4 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v = ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
if v <= 0x10FFFF
then return $ chr v
else throwException (IllegalRepresentation [w1,w2,w3,w4])
| Bool
otherwise -> DecodingException -> m Char
forall a. DecodingException -> m a
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
decodeChar UTF8
UTF8Strict = do
w1 <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
case () of
()
_
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0x7F -> Char -> m Char
forall a. a -> m a
forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> m Char) -> Char -> m Char
forall a b. (a -> b) -> a -> b
$ Int -> Char
chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
w1
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xBF -> DecodingException -> m Char
forall a. DecodingException -> m a
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xDF -> do
w2 <- m Word8
fetchExtend8
let v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x1F
if v1 <= 1
then throwException (IllegalRepresentation [w1,w2])
else return $ chr $
((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xEF -> do
w2 <- m Word8
fetchExtend8
w3 <- fetchExtend8
let v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x0F
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
if v1 == 0 && v2 < 0x20
then throwException (IllegalRepresentation [w1,w2,w3])
else return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| Word8
w1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
<= Word8
0xF7 -> do
w2 <- m Word8
fetchExtend8
w3 <- fetchExtend8
w4 <- fetchExtend8
let v1 = Word8
w1 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x07
v2 = Word8
w2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v3 = Word8
w3 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v4 = Word8
w4 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.&. Word8
0x3F
v = ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v1) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
18)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v2) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v3) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6)
Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
v4)
if v1 == 0 && v2 < 0x10
then throwException (IllegalRepresentation [w1,w2,w3,w4])
else (if v <= 0x10FFFF
then return $ chr v
else throwException (IllegalRepresentation [w1,w2,w3,w4]))
| Bool
otherwise -> DecodingException -> m Char
forall a. DecodingException -> m a
forall e (m :: * -> *) a. Throws e m => e -> m a
throwException (Word8 -> DecodingException
IllegalCharacter Word8
w1)
where
invalidExtend :: a -> Bool
invalidExtend a
wrd = a
wrd a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
0xC0 a -> a -> Bool
forall a. Eq a => a -> a -> Bool
/= a
0x80
fetchExtend8 :: m Word8
fetchExtend8 = do
w <- m Word8
forall (m :: * -> *). ByteSource m => m Word8
fetchWord8
if invalidExtend w
then throwException (IllegalCharacter w)
else return w