{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module Text.XmlHtml.TextParser
( guessEncoding
, parse
, isValidChar
, parseText
, takeWhile0
, takeWhile1
, text
, scanText
, ScanState(..)
, module Text.Parsec.Text
) where
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
import Data.Char
import Data.Maybe
import Text.XmlHtml.Common
import Data.Text (Text)
import qualified Data.Text as T
import qualified Text.Parsec as P
import Text.Parsec.Text
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
guessEncoding :: ByteString -> (Encoding, ByteString)
guessEncoding b
| B.take 3 b == B.pack [ 0xEF, 0xBB, 0xBF ] = (UTF8, B.drop 3 b)
| B.take 2 b == B.pack [ 0xFE, 0xFF ] = (UTF16BE, B.drop 2 b)
| B.take 2 b == B.pack [ 0xFF, 0xFE ] = (UTF16LE, B.drop 2 b)
| otherwise = (UTF8, b)
parse :: (Encoding -> Parser a) -> String -> ByteString -> Either String a
parse p src b = let (e, b') = guessEncoding b
t = decoder e b'
bad = T.find (not . isValidChar) t
in if isNothing bad
then parseText (p e <* P.eof) src t
else Left $ "Document contains invalid character:"
++ " \\" ++ show (ord (fromJust bad))
isValidChar :: Char -> Bool
isValidChar c | c < '\x9' = False
| c > '\xA' && c < '\xD' = False
| c > '\xD' && c < '\x20' = False
| c > '\xD7FF' && c < '\xE000' = False
| c > '\xFFFD' && c < '\x10000' = False
| otherwise = True
parseText :: Parser a
-> String
-> Text
-> Either String a
parseText p src t = inLeft show (P.parse p src t)
where inLeft :: (a -> b) -> Either a c -> Either b c
inLeft f (Left x) = Left (f x)
inLeft _ (Right x) = Right x
takeWhile0 :: (Char -> Bool) -> Parser Text
takeWhile0 p = fmap T.pack $ P.many $ P.satisfy p
takeWhile1 :: (Char -> Bool) -> Parser Text
takeWhile1 p = fmap T.pack $ P.many1 $ P.satisfy p
text :: Text -> Parser Text
text t = P.try $ P.string (T.unpack t) *> return t
data ScanState = ScanNext (Char -> ScanState)
| ScanFinish
| ScanFail String
scanText :: (Char -> ScanState) -> Parser String
scanText f = do
P.try $ do
c <- P.anyChar
case f c of
ScanNext f' -> (c:) `fmap` scanText f'
ScanFinish -> return [c]
ScanFail err -> fail err