{-# LANGUAGE CPP                   #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}

module Text.XmlHtml.TextParser
( guessEncoding
, parse
, isValidChar
, parseText
, takeWhile0
, takeWhile1
, text
, scanText
, ScanState(..)

, module Text.Parsec.Text
) where

#if !MIN_VERSION_base(4,8,0)
import           Control.Applicative
#endif
import           Data.Char
import           Data.Maybe
import           Text.XmlHtml.Common

import           Data.Text (Text)
import qualified Data.Text as T

import qualified Text.Parsec as P
import           Text.Parsec.Text

import           Data.ByteString (ByteString)
import qualified Data.ByteString as B

------------------------------------------------------------------------------
-- | Get an initial guess at document encoding from the byte order mark.  If
-- the mark doesn't exist, guess UTF-8.  Otherwise, guess according to the
-- mark.
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding b :: ByteString
b
    | Int -> ByteString -> ByteString
B.take 3 ByteString
b ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ 0xEF, 0xBB, 0xBF ] = (Encoding
UTF8,    Int -> ByteString -> ByteString
B.drop 3 ByteString
b)
    | Int -> ByteString -> ByteString
B.take 2 ByteString
b ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ 0xFE, 0xFF ]       = (Encoding
UTF16BE, Int -> ByteString -> ByteString
B.drop 2 ByteString
b)
    | Int -> ByteString -> ByteString
B.take 2 ByteString
b ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== [Word8] -> ByteString
B.pack [ 0xFF, 0xFE ]       = (Encoding
UTF16LE, Int -> ByteString -> ByteString
B.drop 2 ByteString
b)
    | Bool
otherwise                                 = (Encoding
UTF8,    ByteString
b)

------------------------------------------------------------------------------
parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
parse p :: Encoding -> Parser a
p src :: String
src b :: ByteString
b = let (e :: Encoding
e, b' :: ByteString
b') = ByteString -> (Encoding, ByteString)
guessEncoding ByteString
b
                    t :: Text
t       = Encoding -> ByteString -> Text
decoder Encoding
e ByteString
b'
                    bad :: Maybe Char
bad     = (Char -> Bool) -> Text -> Maybe Char
T.find (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isValidChar) Text
t
                in  if Maybe Char -> Bool
forall a. Maybe a -> Bool
isNothing Maybe Char
bad
                        then Parser a -> String -> Text -> Either String a
forall a. Parser a -> String -> Text -> Either String a
parseText (Encoding -> Parser a
p Encoding
e Parser a -> ParsecT Text () Identity () -> Parser a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Text () Identity ()
forall s (m :: * -> *) t u.
(Stream s m t, Show t) =>
ParsecT s u m ()
P.eof) String
src Text
t
                        else String -> Either String a
forall a b. a -> Either a b
Left (String -> Either String a) -> String -> Either String a
forall a b. (a -> b) -> a -> b
$ "Document contains invalid character:"
                                 String -> String -> String
forall a. [a] -> [a] -> [a]
++ " \\" String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show (Char -> Int
ord (Maybe Char -> Char
forall a. HasCallStack => Maybe a -> a
fromJust Maybe Char
bad))


------------------------------------------------------------------------------
-- | Checks if a document contains invalid characters.
--
isValidChar :: Char -> Bool
isValidChar :: Char -> Bool
isValidChar c :: Char
c | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< '\x9'                     = Bool
False
              | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> '\xA'    Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< '\xD'     = Bool
False
              | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> '\xD'    Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< '\x20'    = Bool
False
              | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> '\xD7FF' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< '\xE000'  = Bool
False
              | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
> '\xFFFD' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< '\x10000' = Bool
False
              | Bool
otherwise                     = Bool
True


------------------------------------------------------------------------------
-- | Parses a 'Text' value and gives back the result.  The parser is expected
-- to match the entire string.
parseText :: Parser a         -- ^ The parser to match
          -> String           -- ^ Name of the source file (can be @\"\"@)
          -> Text             -- ^ Text to parse
          -> Either String a  -- Either an error message or the result
parseText :: Parser a -> String -> Text -> Either String a
parseText p :: Parser a
p src :: String
src t :: Text
t = (ParseError -> String) -> Either ParseError a -> Either String a
forall a b c. (a -> b) -> Either a c -> Either b c
inLeft ParseError -> String
forall a. Show a => a -> String
show (Parser a -> String -> Text -> Either ParseError a
forall s t a.
Stream s Identity t =>
Parsec s () a -> String -> s -> Either ParseError a
P.parse Parser a
p String
src Text
t)
  where inLeft :: (a -> b) -> Either a c -> Either b c
        inLeft :: (a -> b) -> Either a c -> Either b c
inLeft f :: a -> b
f (Left x :: a
x)  = b -> Either b c
forall a b. a -> Either a b
Left (a -> b
f a
x)
        inLeft _ (Right x :: c
x) = c -> Either b c
forall a b. b -> Either a b
Right c
x


------------------------------------------------------------------------------
-- | Consume input as long as the predicate returns 'True', and return the
-- consumed input.  This parser does not fail.  If it matches no input, it
-- will return an empty string.
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 p :: Char -> Bool
p = (String -> Text) -> ParsecT Text () Identity String -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack (ParsecT Text () Identity String -> Parser Text)
-> ParsecT Text () Identity String -> Parser Text
forall a b. (a -> b) -> a -> b
$ ParsecT Text () Identity Char -> ParsecT Text () Identity String
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m [a]
P.many (ParsecT Text () Identity Char -> ParsecT Text () Identity String)
-> ParsecT Text () Identity Char -> ParsecT Text () Identity String
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p


------------------------------------------------------------------------------
-- | Consume input as long as the predicate returns 'True', and return the
-- consumed input.  This parser requires the predicate to succeed on at least
-- one character of input.  It will fail if the first character fails the
-- predicate.
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 p :: Char -> Bool
p = (String -> Text) -> ParsecT Text () Identity String -> Parser Text
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Text
T.pack (ParsecT Text () Identity String -> Parser Text)
-> ParsecT Text () Identity String -> Parser Text
forall a b. (a -> b) -> a -> b
$ ParsecT Text () Identity Char -> ParsecT Text () Identity String
forall s (m :: * -> *) t u a.
Stream s m t =>
ParsecT s u m a -> ParsecT s u m [a]
P.many1 (ParsecT Text () Identity Char -> ParsecT Text () Identity String)
-> ParsecT Text () Identity Char -> ParsecT Text () Identity String
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> ParsecT Text () Identity Char
forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy Char -> Bool
p


------------------------------------------------------------------------------
-- | The equivalent of Parsec's string combinator, but for text.  If there is
-- not a complete match, then no input is consumed.  This matches the behavior
-- of @string@ from the attoparsec-text package.
text :: Text -> Parser Text
text :: Text -> Parser Text
text t :: Text
t = Parser Text -> Parser Text
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (Parser Text -> Parser Text) -> Parser Text -> Parser Text
forall a b. (a -> b) -> a -> b
$ String -> ParsecT Text () Identity String
forall s (m :: * -> *) u.
Stream s m Char =>
String -> ParsecT s u m String
P.string (Text -> String
T.unpack Text
t) ParsecT Text () Identity String -> Parser Text -> Parser Text
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Text -> Parser Text
forall (m :: * -> *) a. Monad m => a -> m a
return Text
t


------------------------------------------------------------------------------
-- | Represents the state of a text scanner, for use with the 'scanText'
-- parser combinator.
data ScanState = ScanNext (Char -> ScanState)
               | ScanFinish
               | ScanFail String


------------------------------------------------------------------------------
-- | Scans text and progresses through a DFA, collecting the complete matching
-- text as it goes.
scanText :: (Char -> ScanState) -> Parser String
scanText :: (Char -> ScanState) -> ParsecT Text () Identity String
scanText f :: Char -> ScanState
f = do
    ParsecT Text () Identity String -> ParsecT Text () Identity String
forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (ParsecT Text () Identity String
 -> ParsecT Text () Identity String)
-> ParsecT Text () Identity String
-> ParsecT Text () Identity String
forall a b. (a -> b) -> a -> b
$ do
        Char
c <- ParsecT Text () Identity Char
forall s (m :: * -> *) u. Stream s m Char => ParsecT s u m Char
P.anyChar
        case Char -> ScanState
f Char
c of
            ScanNext f' :: Char -> ScanState
f'  -> (Char
cChar -> String -> String
forall a. a -> [a] -> [a]
:) (String -> String)
-> ParsecT Text () Identity String
-> ParsecT Text () Identity String
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` (Char -> ScanState) -> ParsecT Text () Identity String
scanText Char -> ScanState
f'
            ScanFinish   -> String -> ParsecT Text () Identity String
forall (m :: * -> *) a. Monad m => a -> m a
return [Char
c]
            ScanFail err :: String
err -> String -> ParsecT Text () Identity String
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err