{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude
, BangPatterns
, NondecreasingIndentation
, MagicHash
#-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
module GHC.IO.Encoding.UTF32 (
utf32, mkUTF32,
utf32_decode,
utf32_encode,
utf32be, mkUTF32be,
utf32be_decode,
utf32be_encode,
utf32le, mkUTF32le,
utf32le_decode,
utf32le_encode,
) where
import GHC.Base
import GHC.Real
import GHC.Num
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
import GHC.Word
import Data.Bits
import Data.Maybe
import GHC.IORef
utf32 :: TextEncoding
utf32 = mkUTF32 ErrorOnCodingFailure
mkUTF32 :: CodingFailureMode -> TextEncoding
mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32",
mkTextDecoder = utf32_DF cfm,
mkTextEncoder = utf32_EF cfm }
utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer))
utf32_DF cfm = do
seen_bom <- newIORef Nothing
return (BufferCodec {
encode = utf32_decode seen_bom,
recover = recoverDecode cfm,
close = return (),
getState = readIORef seen_bom,
setState = writeIORef seen_bom
})
utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool)
utf32_EF cfm = do
done_bom <- newIORef False
return (BufferCodec {
encode = utf32_encode done_bom,
recover = recoverEncode cfm,
close = return (),
getState = readIORef done_bom,
setState = writeIORef done_bom
})
utf32_encode :: IORef Bool -> EncodeBuffer
utf32_encode done_bom input
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os }
= do
b <- readIORef done_bom
if b then utf32_native_encode input output
else if os - ow < 4
then return (OutputUnderflow, input,output)
else do
writeIORef done_bom True
writeWord8Buf oraw ow bom0
writeWord8Buf oraw (ow+1) bom1
writeWord8Buf oraw (ow+2) bom2
writeWord8Buf oraw (ow+3) bom3
utf32_native_encode input output{ bufR = ow+4 }
utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer
utf32_decode seen_bom
input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ }
output
= do
mb <- readIORef seen_bom
case mb of
Just decode -> decode input output
Nothing ->
if iw - ir < 4 then return (InputUnderflow, input,output) else do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
case () of
_ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do
writeIORef seen_bom (Just utf32be_decode)
utf32be_decode input{ bufL= ir+4 } output
_ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do
writeIORef seen_bom (Just utf32le_decode)
utf32le_decode input{ bufL= ir+4 } output
| otherwise -> do
writeIORef seen_bom (Just utf32_native_decode)
utf32_native_decode input output
bom0, bom1, bom2, bom3 :: Word8
bom0 = 0
bom1 = 0
bom2 = 0xfe
bom3 = 0xff
utf32_native_decode :: DecodeBuffer
utf32_native_decode = utf32be_decode
utf32_native_encode :: EncodeBuffer
utf32_native_encode = utf32be_encode
utf32be :: TextEncoding
utf32be = mkUTF32be ErrorOnCodingFailure
mkUTF32be :: CodingFailureMode -> TextEncoding
mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE",
mkTextDecoder = utf32be_DF cfm,
mkTextEncoder = utf32be_EF cfm }
utf32be_DF :: CodingFailureMode -> IO (TextDecoder ())
utf32be_DF cfm =
return (BufferCodec {
encode = utf32be_decode,
recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32be_EF :: CodingFailureMode -> IO (TextEncoder ())
utf32be_EF cfm =
return (BufferCodec {
encode = utf32be_encode,
recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32le :: TextEncoding
utf32le = mkUTF32le ErrorOnCodingFailure
mkUTF32le :: CodingFailureMode -> TextEncoding
mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE",
mkTextDecoder = utf32le_DF cfm,
mkTextEncoder = utf32le_EF cfm }
utf32le_DF :: CodingFailureMode -> IO (TextDecoder ())
utf32le_DF cfm =
return (BufferCodec {
encode = utf32le_decode,
recover = recoverDecode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32le_EF :: CodingFailureMode -> IO (TextEncoder ())
utf32le_EF cfm =
return (BufferCodec {
encode = utf32le_encode,
recover = recoverEncode cfm,
close = return (),
getState = return (),
setState = const $ return ()
})
utf32be_decode :: DecodeBuffer
utf32be_decode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
| ow >= os = done OutputUnderflow ir ow
| iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x1 = chr4 c0 c1 c2 c3
if not (validate x1) then invalid else do
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
invalid = done InvalidSequence ir ow
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
in
loop ir0 ow0
utf32le_decode :: DecodeBuffer
utf32le_decode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
loop !ir !ow
| ow >= os = done OutputUnderflow ir ow
| iw - ir < 4 = done InputUnderflow ir ow
| otherwise = do
c0 <- readWord8Buf iraw ir
c1 <- readWord8Buf iraw (ir+1)
c2 <- readWord8Buf iraw (ir+2)
c3 <- readWord8Buf iraw (ir+3)
let x1 = chr4 c3 c2 c1 c0
if not (validate x1) then invalid else do
ow' <- writeCharBuf oraw ow x1
loop (ir+4) ow'
where
invalid = done InvalidSequence ir ow
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
in
loop ir0 ow0
utf32be_encode :: EncodeBuffer
utf32be_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
loop !ir !ow
| ir >= iw = done InputUnderflow ir ow
| os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
if isSurrogate c then done InvalidSequence ir ow else do
let (c0,c1,c2,c3) = ord4 c
writeWord8Buf oraw ow c0
writeWord8Buf oraw (ow+1) c1
writeWord8Buf oraw (ow+2) c2
writeWord8Buf oraw (ow+3) c3
loop ir' (ow+4)
in
loop ir0 ow0
utf32le_encode :: EncodeBuffer
utf32le_encode
input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os }
= let
done why !ir !ow = return (why,
if ir == iw then input{ bufL=0, bufR=0 }
else input{ bufL=ir },
output{ bufR=ow })
loop !ir !ow
| ir >= iw = done InputUnderflow ir ow
| os - ow < 4 = done OutputUnderflow ir ow
| otherwise = do
(c,ir') <- readCharBuf iraw ir
if isSurrogate c then done InvalidSequence ir ow else do
let (c0,c1,c2,c3) = ord4 c
writeWord8Buf oraw ow c3
writeWord8Buf oraw (ow+1) c2
writeWord8Buf oraw (ow+2) c1
writeWord8Buf oraw (ow+3) c0
loop ir' (ow+4)
in
loop ir0 ow0
chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
C# (chr# (z1# +# z2# +# z3# +# z4#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!y3# = word2Int# x3#
!y4# = word2Int# x4#
!z1# = uncheckedIShiftL# y1# 24#
!z2# = uncheckedIShiftL# y2# 16#
!z3# = uncheckedIShiftL# y3# 8#
!z4# = y4#
{-# INLINE chr4 #-}
ord4 :: Char -> (Word8,Word8,Word8,Word8)
ord4 c = (fromIntegral (x `shiftR` 24),
fromIntegral (x `shiftR` 16),
fromIntegral (x `shiftR` 8),
fromIntegral x)
where
x = ord c
{-# INLINE ord4 #-}
validate :: Char -> Bool
validate c = (x1 >= 0x0 && x1 < 0xD800) || (x1 > 0xDFFF && x1 <= 0x10FFFF)
where x1 = ord c
{-# INLINE validate #-}