{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: [email protected]
Stability : experimental

Miscellaneous bits common to various parsers
-}
module Unicode.CharacterDatabase.Parser.Internal (
  -- * Word8 patterns
  pattern Asterisk,
  pattern Comma,
  pattern HashTag,
  pattern NewLine,
  pattern Period,
  pattern SemiColon,
  pattern Slash,

  -- * Parser helpers
  withParser,

  -- * Code point
  parseCodePoint,
  parseCodePointM,

  -- * Range
  CodePointRange (..),
  parseCodePointRange,
  parseCodePointRange',

  -- * Numeric value
  NumericValue (..),
  parseNumericValue,

  -- * Boolean value
  parseBoolValue,
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.Char (chr)
import Data.Maybe (fromMaybe)
import Data.Ratio ((%))
import Data.Word (Word8)
import GHC.Stack (HasCallStack)

--------------------------------------------------------------------------------
-- Char8 patterns
--------------------------------------------------------------------------------

-- | @'\\n'@
pattern NewLine  Word8
pattern $mNewLine :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bNewLine :: Word8
NewLine = 0x0a

-- | @#@
pattern HashTag  Word8
pattern $mHashTag :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bHashTag :: Word8
HashTag = 0x23

-- | @*@
pattern Asterisk  Word8
pattern $mAsterisk :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bAsterisk :: Word8
Asterisk = 0x2a

-- | @,@
pattern Comma  Word8
pattern $mComma :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bComma :: Word8
Comma = 0x2c

-- | @.@
pattern Period  Word8
pattern $mPeriod :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bPeriod :: Word8
Period = 0x2e

-- | @\/@
pattern Slash  Word8
pattern $mSlash :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bSlash :: Word8
Slash = 0x2f

-- | @;@
pattern SemiColon  Word8
pattern $mSemiColon :: forall {r}. Word8 -> ((# #) -> r) -> ((# #) -> r) -> r
$bSemiColon :: Word8
SemiColon = 0x3b

--------------------------------------------------------------------------------
-- Parse helpers
--------------------------------------------------------------------------------

-- | Use the given parser to parse each line
withParser  (HasCallStack)  (B.ByteString  Maybe a)  B.ByteString  Maybe (a, B.ByteString)
withParser :: forall a.
HasCallStack =>
(ByteString -> Maybe a) -> ByteString -> Maybe (a, ByteString)
withParser ByteString -> Maybe a
parse = ByteString -> Maybe (a, ByteString)
go (ByteString -> Maybe (a, ByteString))
-> (ByteString -> ByteString)
-> ByteString
-> Maybe (a, ByteString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
skipUtf8BOM
 where
  go :: ByteString -> Maybe (a, ByteString)
go ByteString
raw
    | ByteString -> Bool
B.null ByteString
raw = Maybe (a, ByteString)
forall a. Maybe a
Nothing
    | Bool
otherwise = case (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
NewLine) ByteString
raw of
        (ByteString -> ByteString
B8.strip  ByteString
line, Int -> ByteString -> ByteString
B.drop Int
1  ByteString
raw') 
          case ByteString -> Maybe a
parse ByteString
line of
            Maybe a
Nothing  ByteString -> Maybe (a, ByteString)
go ByteString
raw'
            Just a
entry  (a, ByteString) -> Maybe (a, ByteString)
forall a. a -> Maybe a
Just (a
entry, ByteString
raw')

skipUtf8BOM  B.ByteString  B.ByteString
skipUtf8BOM :: ByteString -> ByteString
skipUtf8BOM ByteString
raw = ByteString -> Maybe ByteString -> ByteString
forall a. a -> Maybe a -> a
fromMaybe ByteString
raw (ByteString -> ByteString -> Maybe ByteString
B.stripPrefix ByteString
"\xEF\xBB\xBF" ByteString
raw)

--------------------------------------------------------------------------------
-- Code point parser
--------------------------------------------------------------------------------

{- | Parse a code point formatted as hexadecimal

/Warning:/ raise an error on invalid input.

>>> parseCodePoint "0061"
'a'

@since 0.1.0
-}
parseCodePoint  (HasCallStack)  B.ByteString  Char
parseCodePoint :: HasCallStack => ByteString -> Char
parseCodePoint = Int -> Char
chr (Int -> Char) -> (ByteString -> Int) -> ByteString -> Char
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Int
forall a. Read a => String -> a
read (String -> Int) -> (ByteString -> String) -> ByteString -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
B8.unpack (ByteString -> String)
-> (ByteString -> ByteString) -> ByteString -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString
"0x" <>)

{- | Parse a code point formatted as hexadecimal, or return 'Nothing' on an
empty string.

/Warning:/ raise an error on invalid input.

>>> parseCodePointM "0061"
Just 'a'
>>> parseCodePointM ""
Nothing

See also: 'parseCodePoint'.

@since 0.1.0
-}
parseCodePointM  (HasCallStack)  B.ByteString  Maybe Char
parseCodePointM :: HasCallStack => ByteString -> Maybe Char
parseCodePointM ByteString
raw
  | ByteString -> Bool
B.null ByteString
raw = Maybe Char
forall a. Maybe a
Nothing
  | Bool
otherwise = Char -> Maybe Char
forall a. a -> Maybe a
Just (HasCallStack => ByteString -> Char
ByteString -> Char
parseCodePoint ByteString
raw)

--------------------------------------------------------------------------------
-- Code point range parser
--------------------------------------------------------------------------------

{- | A Unicode code point range

@since 0.1.0
-}
data CodePointRange
  = SingleChar {CodePointRange -> Char
start  !Char}
  | CharRange {start  !Char, CodePointRange -> Char
end  !Char}
  deriving (CodePointRange -> CodePointRange -> Bool
(CodePointRange -> CodePointRange -> Bool)
-> (CodePointRange -> CodePointRange -> Bool) -> Eq CodePointRange
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CodePointRange -> CodePointRange -> Bool
== :: CodePointRange -> CodePointRange -> Bool
$c/= :: CodePointRange -> CodePointRange -> Bool
/= :: CodePointRange -> CodePointRange -> Bool
Eq, Eq CodePointRange
Eq CodePointRange =>
(CodePointRange -> CodePointRange -> Ordering)
-> (CodePointRange -> CodePointRange -> Bool)
-> (CodePointRange -> CodePointRange -> Bool)
-> (CodePointRange -> CodePointRange -> Bool)
-> (CodePointRange -> CodePointRange -> Bool)
-> (CodePointRange -> CodePointRange -> CodePointRange)
-> (CodePointRange -> CodePointRange -> CodePointRange)
-> Ord CodePointRange
CodePointRange -> CodePointRange -> Bool
CodePointRange -> CodePointRange -> Ordering
CodePointRange -> CodePointRange -> CodePointRange
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: CodePointRange -> CodePointRange -> Ordering
compare :: CodePointRange -> CodePointRange -> Ordering
$c< :: CodePointRange -> CodePointRange -> Bool
< :: CodePointRange -> CodePointRange -> Bool
$c<= :: CodePointRange -> CodePointRange -> Bool
<= :: CodePointRange -> CodePointRange -> Bool
$c> :: CodePointRange -> CodePointRange -> Bool
> :: CodePointRange -> CodePointRange -> Bool
$c>= :: CodePointRange -> CodePointRange -> Bool
>= :: CodePointRange -> CodePointRange -> Bool
$cmax :: CodePointRange -> CodePointRange -> CodePointRange
max :: CodePointRange -> CodePointRange -> CodePointRange
$cmin :: CodePointRange -> CodePointRange -> CodePointRange
min :: CodePointRange -> CodePointRange -> CodePointRange
Ord, Int -> CodePointRange -> ShowS
[CodePointRange] -> ShowS
CodePointRange -> String
(Int -> CodePointRange -> ShowS)
-> (CodePointRange -> String)
-> ([CodePointRange] -> ShowS)
-> Show CodePointRange
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CodePointRange -> ShowS
showsPrec :: Int -> CodePointRange -> ShowS
$cshow :: CodePointRange -> String
show :: CodePointRange -> String
$cshowList :: [CodePointRange] -> ShowS
showList :: [CodePointRange] -> ShowS
Show)

{- | Parse @AAAA..BBBB@ range or single code point

@since 0.1.0
-}
parseCodePointRange  (HasCallStack)  B.ByteString  CodePointRange
parseCodePointRange :: HasCallStack => ByteString -> CodePointRange
parseCodePointRange ByteString
raw = case (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
Period) ByteString
raw of
  (HasCallStack => ByteString -> Char
ByteString -> Char
parseCodePoint  Char
ch1, ByteString
rest)
    | ByteString -> Bool
B.null ByteString
rest  Char -> CodePointRange
SingleChar Char
ch1
    | Bool
otherwise  Char -> Char -> CodePointRange
CharRange Char
ch1 (HasCallStack => ByteString -> Char
ByteString -> Char
parseCodePoint (Int -> ByteString -> ByteString
B.drop Int
2 ByteString
rest))

{- | Parse @AAAA..BBBB@ range

@since 0.1.0
-}
parseCodePointRange'  (HasCallStack)  B.ByteString  (Char, Char)
parseCodePointRange' :: HasCallStack => ByteString -> (Char, Char)
parseCodePointRange' ByteString
raw = case (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
Period) ByteString
raw of
  (HasCallStack => ByteString -> Char
ByteString -> Char
parseCodePoint  Char
ch1, ByteString
rest)  (Char
ch1, HasCallStack => ByteString -> Char
ByteString -> Char
parseCodePoint (Int -> ByteString -> ByteString
B.drop Int
2 ByteString
rest))

--------------------------------------------------------------------------------
-- Numeric value parser
--------------------------------------------------------------------------------

{- | Numeric value of a code point, if relevant

@since 0.1.0
-}
data NumericValue
  = NotNumeric
  | Integer !Integer
  | Rational !Rational
  deriving (NumericValue -> NumericValue -> Bool
(NumericValue -> NumericValue -> Bool)
-> (NumericValue -> NumericValue -> Bool) -> Eq NumericValue
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: NumericValue -> NumericValue -> Bool
== :: NumericValue -> NumericValue -> Bool
$c/= :: NumericValue -> NumericValue -> Bool
/= :: NumericValue -> NumericValue -> Bool
Eq, Int -> NumericValue -> ShowS
[NumericValue] -> ShowS
NumericValue -> String
(Int -> NumericValue -> ShowS)
-> (NumericValue -> String)
-> ([NumericValue] -> ShowS)
-> Show NumericValue
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> NumericValue -> ShowS
showsPrec :: Int -> NumericValue -> ShowS
$cshow :: NumericValue -> String
show :: NumericValue -> String
$cshowList :: [NumericValue] -> ShowS
showList :: [NumericValue] -> ShowS
Show)

-- | See: https://www.unicode.org/reports/tr44/#Numeric_Value
parseNumericValue  (HasCallStack)  B.ByteString  NumericValue
parseNumericValue :: HasCallStack => ByteString -> NumericValue
parseNumericValue ByteString
raw
  | ByteString -> Bool
B.null ByteString
raw = NumericValue
NotNumeric
  | Word8 -> ByteString -> Bool
B.elem Word8
Slash ByteString
raw = case (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
Slash) ByteString
raw of
      (ByteString
num, ByteString
denum)  Rational -> NumericValue
Rational (ByteString -> Integer
readB ByteString
num Integer -> Integer -> Rational
forall a. Integral a => a -> a -> Ratio a
% (ByteString -> Integer
readB (ByteString -> Integer)
-> (ByteString -> ByteString) -> ByteString -> Integer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.drop Int
1) ByteString
denum)
       where
        readB :: ByteString -> Integer
readB = String -> Integer
forall a. Read a => String -> a
read (String -> Integer)
-> (ByteString -> String) -> ByteString -> Integer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
B8.unpack
  | Bool
otherwise = Integer -> NumericValue
Integer (String -> Integer
forall a. Read a => String -> a
read (ByteString -> String
B8.unpack ByteString
raw))

--------------------------------------------------------------------------------
-- Boolean value parser
--------------------------------------------------------------------------------

-- | Parse boolean values ‘Y’ and ‘N’.
parseBoolValue  (HasCallStack)  B.ByteString  Bool
parseBoolValue :: HasCallStack => ByteString -> Bool
parseBoolValue = \case
  ByteString
"Y"  Bool
True
  ByteString
"N"  Bool
False
  ByteString
raw  String -> Bool
forall a. HasCallStack => String -> a
error (String
"parseBoolValue: Cannot parse: " String -> ShowS
forall a. Semigroup a => a -> a -> a
<> ByteString -> String
forall a. Show a => a -> String
show ByteString
raw)