-- |
-- Module: M.NBT.Internal.JS
-- Description: Java string encoding (CESU-8) support
-- Copyright: (c) axionbuster, 2025
-- License: BSD-3-Clause
--
-- Implements Java's Modified UTF-8 (CESU-8) encoding and decoding for
-- string handling in NBT format.
--
-- See: https://docs.oracle.com/javase/8/docs/api/java/io/DataInput.html#modified-utf-8
module M.NBT.Internal.JS
  ( JS (..),
    textascesu8,
    cesu8astext,
    tocesu8,
    tocesu8p,
    fromcesu8,
    fromcesu8p,
  )
where

import Control.DeepSeq
import Control.Exception
import Control.Monad.Fix
import Data.Bits
import Data.ByteString (ByteString)
import Data.ByteString qualified as B
import Data.ByteString.Builder
import Data.Data
import Data.Function
import Data.Functor
import Data.Hashable
import Data.String
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf16LEWith, encodeUtf8)
import Data.Text.Encoding.Error (UnicodeException (DecodeError))
import Data.Word
import FlatParse.Stateful qualified as F
import GHC.Generics
import Language.Haskell.TH.Syntax (Lift)
import M.Pack
import System.IO.Unsafe
import Text.Printf

-- | Java's CESU-8 encoding/decoding
--
-- this newtype is purely for modulation of encoding and decoding.
-- it is not intended to be used directly in the public API
--
-- use 'textascesu8' and 'cesu8astext' to convert between 'Text'
-- and 'ByteString'
--
-- see:
--
-- * https://en.wikipedia.org/wiki/UTF-8#CESU-8
-- * https://docs.oracle.com/en/java/javase/18/docs/api/java.base/java/io/DataInput.html#modified-utf-8
newtype JS = JS {JS -> Text
getjs :: Text}
  deriving stock ((forall x. JS -> Rep JS x)
-> (forall x. Rep JS x -> JS) -> Generic JS
forall x. Rep JS x -> JS
forall x. JS -> Rep JS x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. JS -> Rep JS x
from :: forall x. JS -> Rep JS x
$cto :: forall x. Rep JS x -> JS
to :: forall x. Rep JS x -> JS
Generic, Typeable, Typeable JS
Typeable JS =>
(forall (c :: * -> *).
 (forall d b. Data d => c (d -> b) -> d -> c b)
 -> (forall g. g -> c g) -> JS -> c JS)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c JS)
-> (JS -> Constr)
-> (JS -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c JS))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c JS))
-> ((forall b. Data b => b -> b) -> JS -> JS)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r)
-> (forall u. (forall d. Data d => d -> u) -> JS -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> JS -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> JS -> m JS)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> JS -> m JS)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> JS -> m JS)
-> Data JS
JS -> Constr
JS -> DataType
(forall b. Data b => b -> b) -> JS -> JS
forall a.
Typeable a =>
(forall (c :: * -> *).
 (forall d b. Data d => c (d -> b) -> d -> c b)
 -> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> JS -> u
forall u. (forall d. Data d => d -> u) -> JS -> [u]
forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> JS -> m JS
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> JS -> m JS
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c JS
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> JS -> c JS
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c JS)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c JS)
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> JS -> c JS
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> JS -> c JS
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c JS
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c JS
$ctoConstr :: JS -> Constr
toConstr :: JS -> Constr
$cdataTypeOf :: JS -> DataType
dataTypeOf :: JS -> DataType
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c JS)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c JS)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c JS)
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c JS)
$cgmapT :: (forall b. Data b => b -> b) -> JS -> JS
gmapT :: (forall b. Data b => b -> b) -> JS -> JS
$cgmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
gmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
$cgmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
gmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> JS -> r
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> JS -> [u]
gmapQ :: forall u. (forall d. Data d => d -> u) -> JS -> [u]
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> JS -> u
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> JS -> u
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> JS -> m JS
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> JS -> m JS
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> JS -> m JS
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> JS -> m JS
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> JS -> m JS
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> JS -> m JS
Data, (forall (m :: * -> *). Quote m => JS -> m Exp)
-> (forall (m :: * -> *). Quote m => JS -> Code m JS) -> Lift JS
forall t.
(forall (m :: * -> *). Quote m => t -> m Exp)
-> (forall (m :: * -> *). Quote m => t -> Code m t) -> Lift t
forall (m :: * -> *). Quote m => JS -> m Exp
forall (m :: * -> *). Quote m => JS -> Code m JS
$clift :: forall (m :: * -> *). Quote m => JS -> m Exp
lift :: forall (m :: * -> *). Quote m => JS -> m Exp
$cliftTyped :: forall (m :: * -> *). Quote m => JS -> Code m JS
liftTyped :: forall (m :: * -> *). Quote m => JS -> Code m JS
Lift)
  deriving newtype (JS -> JS -> Bool
(JS -> JS -> Bool) -> (JS -> JS -> Bool) -> Eq JS
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: JS -> JS -> Bool
== :: JS -> JS -> Bool
$c/= :: JS -> JS -> Bool
/= :: JS -> JS -> Bool
Eq, Eq JS
Eq JS =>
(JS -> JS -> Ordering)
-> (JS -> JS -> Bool)
-> (JS -> JS -> Bool)
-> (JS -> JS -> Bool)
-> (JS -> JS -> Bool)
-> (JS -> JS -> JS)
-> (JS -> JS -> JS)
-> Ord JS
JS -> JS -> Bool
JS -> JS -> Ordering
JS -> JS -> JS
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: JS -> JS -> Ordering
compare :: JS -> JS -> Ordering
$c< :: JS -> JS -> Bool
< :: JS -> JS -> Bool
$c<= :: JS -> JS -> Bool
<= :: JS -> JS -> Bool
$c> :: JS -> JS -> Bool
> :: JS -> JS -> Bool
$c>= :: JS -> JS -> Bool
>= :: JS -> JS -> Bool
$cmax :: JS -> JS -> JS
max :: JS -> JS -> JS
$cmin :: JS -> JS -> JS
min :: JS -> JS -> JS
Ord, Int -> JS -> ShowS
[JS] -> ShowS
JS -> [Char]
(Int -> JS -> ShowS)
-> (JS -> [Char]) -> ([JS] -> ShowS) -> Show JS
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> JS -> ShowS
showsPrec :: Int -> JS -> ShowS
$cshow :: JS -> [Char]
show :: JS -> [Char]
$cshowList :: [JS] -> ShowS
showList :: [JS] -> ShowS
Show, ReadPrec [JS]
ReadPrec JS
Int -> ReadS JS
ReadS [JS]
(Int -> ReadS JS)
-> ReadS [JS] -> ReadPrec JS -> ReadPrec [JS] -> Read JS
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
$creadsPrec :: Int -> ReadS JS
readsPrec :: Int -> ReadS JS
$creadList :: ReadS [JS]
readList :: ReadS [JS]
$creadPrec :: ReadPrec JS
readPrec :: ReadPrec JS
$creadListPrec :: ReadPrec [JS]
readListPrec :: ReadPrec [JS]
Read, Eq JS
Eq JS => (Int -> JS -> Int) -> (JS -> Int) -> Hashable JS
Int -> JS -> Int
JS -> Int
forall a. Eq a => (Int -> a -> Int) -> (a -> Int) -> Hashable a
$chashWithSalt :: Int -> JS -> Int
hashWithSalt :: Int -> JS -> Int
$chash :: JS -> Int
hash :: JS -> Int
Hashable, JS -> ()
(JS -> ()) -> NFData JS
forall a. (a -> ()) -> NFData a
$crnf :: JS -> ()
rnf :: JS -> ()
NFData, [Char] -> JS
([Char] -> JS) -> IsString JS
forall a. ([Char] -> a) -> IsString a
$cfromString :: [Char] -> JS
fromString :: [Char] -> JS
IsString)

instance Pack JS where
  pack :: JS -> Builder
pack = ByteString -> Builder
forall a. Pack a => a -> Builder
pack (ByteString -> Builder) -> (JS -> ByteString) -> JS -> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. JS -> ByteString
tocesu8
  {-# INLINE pack #-}

instance Unpack JS where
  unpack :: forall (st :: ZeroBitType) r. Parser st r JS
unpack = Parser st r Int
forall a (st :: ZeroBitType) r. Integral a => Parser st r a
unpackleb32 Parser st r Int -> (Int -> Parser st r Int) -> Parser st r Int
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= [Char] -> Int -> Parser st r Int
forall a (st :: ZeroBitType) r.
(Num a, Ord a, Show a) =>
[Char] -> a -> Parser st r a
guardnat [Char]
"JavaString" Parser st r Int
-> (Int -> ParserT st r ParseError JS)
-> ParserT st r ParseError JS
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (Int -> ParserT st r ParseError JS -> ParserT st r ParseError JS
forall (st :: ZeroBitType) r e a.
Int -> ParserT st r e a -> ParserT st r e a
`F.isolate` ParserT st r ParseError JS
forall (st :: ZeroBitType) r. Parser st r JS
fromcesu8p)
  {-# INLINE unpack #-}

-- | encode 'Text' into CESU-8 'ByteString'
textascesu8 :: Text -> ByteString
textascesu8 :: Text -> ByteString
textascesu8 = JS -> ByteString
tocesu8 (JS -> ByteString) -> (Text -> JS) -> Text -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> JS
JS
{-# INLINE textascesu8 #-}

-- | decode CESU-8 'ByteString' into 'Text'
cesu8astext :: ByteString -> Maybe Text
cesu8astext :: ByteString -> Maybe Text
cesu8astext ByteString
f = JS -> Text
getjs (JS -> Text) -> Maybe JS -> Maybe Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> Maybe JS
fromcesu8 ByteString
f
{-# INLINE cesu8astext #-}

-- real meat

-- betwixt (between, inclusive)
bxt :: (Ord a) => a -> a -> a -> Bool
bxt :: forall a. Ord a => a -> a -> a -> Bool
bxt a
x a
a a
b = a
a a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
x Bool -> Bool -> Bool
&& a
x a -> a -> Bool
forall a. Ord a => a -> a -> Bool
<= a
b
{-# INLINE bxt #-}

-- | encode text to CESU-8
tocesu8 :: JS -> ByteString
tocesu8 :: JS -> ByteString
tocesu8 = LazyByteString -> ByteString
B.toStrict (LazyByteString -> ByteString)
-> (JS -> LazyByteString) -> JS -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> LazyByteString
toLazyByteString (Builder -> LazyByteString)
-> (JS -> Builder) -> JS -> LazyByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. JS -> Builder
tocesu8p
{-# INLINEABLE tocesu8 #-}

unp :: JS -> [Word8]
unp :: JS -> [Word8]
unp = ByteString -> [Word8]
B.unpack (ByteString -> [Word8]) -> (JS -> ByteString) -> JS -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> ByteString
encodeUtf8 (Text -> ByteString) -> (JS -> Text) -> JS -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. JS -> Text
getjs
{-# INLINE unp #-}

-- | encode text to CESU-8 ('Builder' version)
tocesu8p :: JS -> Builder
tocesu8p :: JS -> Builder
tocesu8p JS
s0 =
  JS -> [Word8]
unp JS
s0 [Word8] -> ([Word8] -> Builder) -> Builder
forall a b. a -> (a -> b) -> b
& (([Word8] -> Builder) -> [Word8] -> Builder) -> [Word8] -> Builder
forall a. (a -> a) -> a
fix \[Word8] -> Builder
go -> \case
    [] -> Builder
forall a. Monoid a => a
mempty
    Word8
0 : [Word8]
xs -> Word8 -> Builder
word8 Word8
0xC0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word8 -> Builder
word8 Word8
0x80 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
xs
    Word8
x : [Word8]
xs | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0x01 Word8
0x7F -> Word8 -> Builder
word8 Word8
x Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
xs
    Word8
x : Word8
y : [Word8]
xs
      | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0xC0 Word8
0xDF Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
y Word8
0x80 Word8
0xBF ->
          Word8 -> Builder
word8 Word8
x Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word8 -> Builder
word8 Word8
y Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
xs
    Word8
x : Word8
y : Word8
z : [Word8]
xs
      | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0xE0 Word8
0xEF Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
y Word8
0x80 Word8
0xBF Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
z Word8
0x80 Word8
0xBF ->
          Word8 -> Builder
word8 Word8
x Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word8 -> Builder
word8 Word8
y Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word8 -> Builder
word8 Word8
z Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
xs
    Word8
x : Word8
y : Word8
z : Word8
w : [Word8]
xs
      | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0xF0 Word8
0xF4
          Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
y Word8
0x80 Word8
0xBF
          Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
z Word8
0x80 Word8
0xBF
          Bool -> Bool -> Bool
&& Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
w Word8
0x80 Word8
0xBF ->
          -- the 4-byte case is special because CESU-8 encodes
          -- them as two UTF-16 surrogate pairs. this creates
          -- 6 bytes of CESU-8 data
          let fi :: (Integral a, Num b) => a -> b
              fi :: forall a b. (Integral a, Num b) => a -> b
fi = a -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral
              (Word32
h, Word32
l) =
                let co :: Word32
co =
                      Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
subtract Word32
0x10000 (Word32 -> Word32) -> Word32 -> Word32
forall a b. (a -> b) -> a -> b
$
                        ((Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fi Word8
x Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x07) Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.<<. Int
18)
                          Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fi Word8
y Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3F) Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.<<. Int
12)
                          Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. ((Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fi Word8
z Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3F) Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.<<. Int
6)
                          Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. (Word8 -> Word32
forall a b. (Integral a, Num b) => a -> b
fi Word8
w Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3F)
                 in (Word32
co Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.>>. Int
10 Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
0xD800, Word32
co Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3FF Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
0xDC00)
              w8 :: Word32 -> Builder
w8 = Word8 -> Builder
word8 (Word8 -> Builder) -> (Word32 -> Word8) -> Word32 -> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word32 -> Word8
forall a b. (Integral a, Num b) => a -> b
fi
              e :: Word32 -> Builder
e Word32
a =
                Word32 -> Builder
w8 ((Word32
a Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.>>. Int
12) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. (Word32
0xE0 :: Word32))
                  Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
w8 (((Word32
a Word32 -> Int -> Word32
forall a. Bits a => a -> Int -> a
.>>. Int
6) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3F) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32
0x80)
                  Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
w8 ((Word32
a Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.&. Word32
0x3F) Word32 -> Word32 -> Word32
forall a. Bits a => a -> a -> a
.|. Word32
0x80)
           in Word32 -> Builder
e Word32
h Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
e Word32
l Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
xs
    -- only possible if the input is not a valid UTF-8 string
    Word8
x : [Word8]
_ -> [Char] -> Builder
forall a. HasCallStack => [Char] -> a
error ([Char] -> Builder) -> [Char] -> Builder
forall a b. (a -> b) -> a -> b
$ [Char] -> Word8 -> [Char]
forall r. PrintfType r => [Char] -> r
printf [Char]
"tocesu8p: unexpected character 0x%02x" Word8
x
{-# INLINEABLE tocesu8p #-}

-- | decode CESU-8 encoded text
fromcesu8 :: ByteString -> Maybe JS
fromcesu8 :: ByteString -> Maybe JS
fromcesu8 ByteString
s = case (forall (st :: ZeroBitType). Parser st () JS)
-> ByteString -> Result JS
forall a.
(forall (st :: ZeroBitType). Parser st () a)
-> ByteString -> Result a
parsepure0 (Parser st () JS
forall (st :: ZeroBitType) r. Parser st r JS
fromcesu8p Parser st () JS -> ParserT st () ParseError () -> Parser st () JS
forall a b.
ParserT st () ParseError a
-> ParserT st () ParseError b -> ParserT st () ParseError a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParserT st () ParseError ()
forall (st :: ZeroBitType) r e. ParserT st r e ()
F.eof) ByteString
s of
  F.OK JS
x Int
_ ByteString
_ -> JS -> Maybe JS
forall a. a -> Maybe a
Just JS
x
  Result JS
_ -> Maybe JS
forall a. Maybe a
Nothing

-- | decode CESU-8 encoded text ('Parser' version)
fromcesu8p :: Parser st r JS
fromcesu8p :: forall (st :: ZeroBitType) r. Parser st r JS
fromcesu8p =
  -- my condolences to the reader... this is a long one
  -- https://docs.oracle.com/en/java/javase/18/docs/api/java.base/java/io/DataInput.html#modified-utf-8
  -- CESU-8 -> UTF-16 -> Text
  ParserT st r ParseError [Word16]
-> ParserT st r ParseError [[Word16]]
forall a. ParserT st r ParseError a -> ParserT st r ParseError [a]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
F.many ParserT st r ParseError [Word16]
forall (st :: ZeroBitType) r. Parser st r [Word16]
cp ParserT st r ParseError [[Word16]]
-> ([[Word16]] -> Either UnicodeException Text)
-> ParserT st r ParseError (Either UnicodeException Text)
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> [[Word16]] -> Either UnicodeException Text
de ParserT st r ParseError (Either UnicodeException Text)
-> (Either UnicodeException Text -> ParserT st r ParseError JS)
-> ParserT st r ParseError JS
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
    Left (UnicodeException
e :: UnicodeException) -> [Char] -> ParserT st r ParseError JS
forall {st :: ZeroBitType} {r} {a}.
[Char] -> ParserT st r ParseError a
se ([Char] -> ParserT st r ParseError JS)
-> [Char] -> ParserT st r ParseError JS
forall a b. (a -> b) -> a -> b
$ [Char]
"fromcesu8p: " [Char] -> ShowS
forall a. Semigroup a => a -> a -> a
<> UnicodeException -> [Char]
forall a. Show a => a -> [Char]
show UnicodeException
e
    Right Text
t -> JS -> ParserT st r ParseError JS
forall a. a -> ParserT st r ParseError a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (JS -> ParserT st r ParseError JS)
-> JS -> ParserT st r ParseError JS
forall a b. (a -> b) -> a -> b
$ Text -> JS
JS Text
t
  where
    se :: [Char] -> ParserT st r ParseError a
se = ParseError -> ParserT st r ParseError a
forall e (st :: ZeroBitType) r a. e -> ParserT st r e a
F.err (ParseError -> ParserT st r ParseError a)
-> ([Char] -> ParseError) -> [Char] -> ParserT st r ParseError a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> ParseError
ParseError
    -- de: decode CESU or error out
    de :: [[Word16]] -> Either UnicodeException Text
de =
      -- upon an invalid character decodeUtf16LEWith will use the given
      -- function to make a substitution. but we don't want that. we want
      -- to throw an error instead. "text" guidelines say, if that's the case,
      -- use 'throw' or 'error'. and we would like to catch the exception,
      -- so we temporarily enter the IO monad
      IO (Either UnicodeException Text) -> Either UnicodeException Text
forall a. IO a -> a
unsafeDupablePerformIO
        (IO (Either UnicodeException Text) -> Either UnicodeException Text)
-> ([[Word16]] -> IO (Either UnicodeException Text))
-> [[Word16]]
-> Either UnicodeException Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IO Text -> IO (Either UnicodeException Text)
forall e a. Exception e => IO a -> IO (Either e a)
try
        (IO Text -> IO (Either UnicodeException Text))
-> ([[Word16]] -> IO Text)
-> [[Word16]]
-> IO (Either UnicodeException Text)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> IO Text
forall a. a -> IO a
evaluate
        (Text -> IO Text) -> ([[Word16]] -> Text) -> [[Word16]] -> IO Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. OnDecodeError -> ByteString -> Text
decodeUtf16LEWith ((UnicodeException -> Maybe Char
forall a e. Exception e => e -> a
throw .) ((Maybe Word8 -> UnicodeException) -> Maybe Word8 -> Maybe Char)
-> ([Char] -> Maybe Word8 -> UnicodeException) -> OnDecodeError
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> Maybe Word8 -> UnicodeException
DecodeError)
        (ByteString -> Text)
-> ([[Word16]] -> ByteString) -> [[Word16]] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Word8] -> ByteString
B.pack
        ([Word8] -> ByteString)
-> ([[Word16]] -> [Word8]) -> [[Word16]] -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word16 -> [Word8]) -> [Word16] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Word16 -> [Word8]
forall {l} {a}.
(IsList l, Integral a, Bits a, Num (Item l)) =>
a -> l
by
        ([Word16] -> [Word8])
-> ([[Word16]] -> [Word16]) -> [[Word16]] -> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[Word16]] -> [Word16]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat
    -- by: split a 16-bit number into two 8-bit numbers
    by :: a -> l
by a
x = [a -> Item l
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x, a -> Item l
forall a b. (Integral a, Num b) => a -> b
fromIntegral (a
x a -> Int -> a
forall a. Bits a => a -> Int -> a
.>>. Int
8)]
    {-# INLINE by #-}
    -- an: extract 6-bit continuation from a CESU-8 byte
    -- the symbol "an" comes from "and" (.&.)
    an :: ParserT st r ParseError Word16
an = forall a (st :: ZeroBitType) r. Unpack a => Parser st r a
unpack @Word8 Parser st r Word8
-> (Word8 -> Word16) -> ParserT st r ParseError Word16
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> \Word8
x -> Word8 -> Word16
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
x Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.&. (Word16
0x3F :: Word16)
    -- sh: shift and combine
    sh :: a -> a -> Int -> a -> a
sh a
a a
b Int
c = (a -> Int -> a
forall a. Bits a => a -> Int -> a
shift (a -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
a a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
b) Int
c .|.)
    -- pu: pure . fromIntegral
    pu :: (Integral a, Num b, Monad m) => [a] -> m [b]
    pu :: forall a b (m :: * -> *).
(Integral a, Num b, Monad m) =>
[a] -> m [b]
pu = [b] -> m [b]
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([b] -> m [b]) -> ([a] -> [b]) -> [a] -> m [b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> [a] -> [b]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral
    -- cp: decode a UTF-16 surrogate pair from CESU-8
    cp :: Parser st r [Word16]
    cp :: forall (st :: ZeroBitType) r. Parser st r [Word16]
cp =
      forall a (st :: ZeroBitType) r. Unpack a => Parser st r a
unpack @Word8 Parser st r Word8
-> (Word8 -> ParserT st r ParseError [Word16])
-> ParserT st r ParseError [Word16]
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
        Word8
x | Word8
x Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0xC0 -> do
          forall a (st :: ZeroBitType) r. Unpack a => Parser st r a
unpack @Word8 Parser st r Word8
-> (Word8 -> ParserT st r ParseError [Word16])
-> ParserT st r ParseError [Word16]
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
            Word8
0x80 -> [Word16] -> ParserT st r ParseError [Word16]
forall a b (m :: * -> *).
(Integral a, Num b, Monad m) =>
[a] -> m [b]
pu [Word16
0 :: Word16] -- 2 bytes; null
            Word8
y -> [Char] -> ParserT st r ParseError [Word16]
forall {st :: ZeroBitType} {r} {a}.
[Char] -> ParserT st r ParseError a
se ([Char] -> ParserT st r ParseError [Word16])
-> [Char] -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$ [Char] -> Word8 -> [Char]
forall r. PrintfType r => [Char] -> r
printf [Char]
"fromcesu8p: unexpected CESU-8 byte 0x%02x" Word8
y
        Word8
x | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0x01 Word8
0x7F -> [Word8] -> ParserT st r ParseError [Word16]
forall a b (m :: * -> *).
(Integral a, Num b, Monad m) =>
[a] -> m [b]
pu ([Word8] -> ParserT st r ParseError [Word16])
-> (Word8 -> [Word8]) -> Word8 -> ParserT st r ParseError [Word16]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> [Word8]
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word8 -> ParserT st r ParseError [Word16])
-> Word8 -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$ Word8
x -- 1 byte; direct
        Word8
x | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0xC0 Word8
0xDF -> ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an ParserT st r ParseError Word16
-> (Word16 -> [Word16]) -> ParserT st r ParseError [Word16]
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> Word16 -> [Word16]
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word16 -> [Word16]) -> (Word16 -> Word16) -> Word16 -> [Word16]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Word8 -> Word16 -> Int -> Word16 -> Word16
forall {a} {a}.
(Bits a, Integral a, Num a) =>
a -> a -> Int -> a -> a
sh Word8
x Word16
0x1F Int
6 -- 2 bytes
        Word8
x | Word8
x Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0xED -> do
          -- 6 bytes; come in surrogate pairs
          Word16
y <- ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an
          Word16
z <- ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an
          if Word16 -> Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word16
y Word16
0x20 Word16
0x2F
            then do
              Word8
x2 <- forall a (st :: ZeroBitType) r. Unpack a => Parser st r a
unpack @Word8
              if Word8
x2 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0xED
                then do
                  Word16
y2 <- ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an
                  Word16
z2 <- ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an
                  if Word16 -> Word16 -> Word16 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word16
y2 Word16
0x30 Word16
0x3F
                    then
                      let high :: Word16
high = Word8 -> Word16 -> Int -> Word16 -> Word16
forall {a} {a}.
(Bits a, Integral a, Num a) =>
a -> a -> Int -> a -> a
sh Word8
x Word16
0x0F Int
12 (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shift Word16
y Int
6 Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.|. Word16
z)
                          low :: Word16
low = Word8 -> Word16 -> Int -> Word16 -> Word16
forall {a} {a}.
(Bits a, Integral a, Num a) =>
a -> a -> Int -> a -> a
sh Word8
x2 Word16
0x0F Int
12 (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shift Word16
y2 Int
6 Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.|. Word16
z2)
                       in [Word16] -> ParserT st r ParseError [Word16]
forall a b (m :: * -> *).
(Integral a, Num b, Monad m) =>
[a] -> m [b]
pu [Word16
Item [Word16]
high, Word16
Item [Word16]
low]
                    else
                      [Char] -> ParserT st r ParseError [Word16]
forall {st :: ZeroBitType} {r} {a}.
[Char] -> ParserT st r ParseError a
se ([Char] -> ParserT st r ParseError [Word16])
-> [Char] -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$
                        [Char] -> Word16 -> [Char]
forall r. PrintfType r => [Char] -> r
printf
                          [Char]
"fromcesu8p: invalid low surrogate ... \
                          \0x%02x is not in [0x30, 0x3F]"
                          Word16
y2
                else
                  [Char] -> ParserT st r ParseError [Word16]
forall {st :: ZeroBitType} {r} {a}.
[Char] -> ParserT st r ParseError a
se ([Char] -> ParserT st r ParseError [Word16])
-> [Char] -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$
                    [Char] -> Word8 -> [Char]
forall r. PrintfType r => [Char] -> r
printf
                      [Char]
"fromcesu8p: invalid surrogate pair ... \
                      \expected 0xED, got 0x%02x"
                      Word8
x2
            else [Word16] -> ParserT st r ParseError [Word16]
forall a b (m :: * -> *).
(Integral a, Num b, Monad m) =>
[a] -> m [b]
pu ([Word16] -> ParserT st r ParseError [Word16])
-> [Word16] -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$ Word16 -> [Word16]
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word16 -> [Word16]) -> Word16 -> [Word16]
forall a b. (a -> b) -> a -> b
$ Word8 -> Word16 -> Int -> Word16 -> Word16
forall {a} {a}.
(Bits a, Integral a, Num a) =>
a -> a -> Int -> a -> a
sh Word8
x Word16
0x0F Int
12 (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shift Word16
y Int
6 Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.|. Word16
z)
        Word8
x -- 3 bytes (exclude 0xED; yeah, spec is weird)
          | Word8 -> Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> a -> Bool
bxt Word8
x Word8
0xE0 Word8
0xEF ->
              ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an ParserT st r ParseError Word16
-> (Word16 -> ParserT st r ParseError [Word16])
-> ParserT st r ParseError [Word16]
forall a b.
ParserT st r ParseError a
-> (a -> ParserT st r ParseError b) -> ParserT st r ParseError b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \Word16
p ->
                ParserT st r ParseError Word16
forall {st :: ZeroBitType} {r}. ParserT st r ParseError Word16
an ParserT st r ParseError Word16
-> (Word16 -> [Word16]) -> ParserT st r ParseError [Word16]
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> \Word16
q ->
                  Word16 -> [Word16]
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Word16 -> [Word16]) -> Word16 -> [Word16]
forall a b. (a -> b) -> a -> b
$ Word8 -> Word16 -> Int -> Word16 -> Word16
forall {a} {a}.
(Bits a, Integral a, Num a) =>
a -> a -> Int -> a -> a
sh Word8
x Word16
0x0F Int
12 (Word16 -> Int -> Word16
forall a. Bits a => a -> Int -> a
shift Word16
p Int
6 Word16 -> Word16 -> Word16
forall a. Bits a => a -> a -> a
.|. Word16
q)
        Word8
x -> [Char] -> ParserT st r ParseError [Word16]
forall {st :: ZeroBitType} {r} {a}.
[Char] -> ParserT st r ParseError a
se ([Char] -> ParserT st r ParseError [Word16])
-> [Char] -> ParserT st r ParseError [Word16]
forall a b. (a -> b) -> a -> b
$ [Char] -> Word8 -> [Char]
forall r. PrintfType r => [Char] -> r
printf [Char]
"fromcesu8p: unexpected CESU-8 byte 0x%02x" Word8
x
    {-# INLINE cp #-}
{-# INLINEABLE fromcesu8p #-}