{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- | HTML entity decoding.

module Text.HTML.TagStream.Entities
  (Dec(..)
  ,isNameChar
  ,isNameStart
  ,decodeEntities)
  where

import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types

import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first,second)

-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: (Monad m
                  ,Monoid builder
                  ,Monoid string
                  ,IsString string
                  ,Eq string)
               => Dec builder string
               -> Conduit (Token' string) m (Token' string)
decodeEntities :: Dec builder string -> Conduit (Token' string) m (Token' string)
decodeEntities dec :: Dec builder string
dec =
    Conduit (Token' string) m (Token' string)
start
  where
    start :: Conduit (Token' string) m (Token' string)
start = ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT (Token' string) (Token' string) m (Maybe (Token' string))
-> (Maybe (Token' string)
    -> Conduit (Token' string) m (Token' string))
-> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Conduit (Token' string) m (Token' string)
-> (Token' string -> Conduit (Token' string) m (Token' string))
-> Maybe (Token' string)
-> Conduit (Token' string) m (Token' string)
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (\token :: Token' string
token -> Token' string -> Conduit (Token' string) m (Token' string)
forall (m :: * -> *).
Monad m =>
Token' string -> ConduitT (Token' string) (Token' string) m ()
start' Token' string
token Conduit (Token' string) m (Token' string)
-> Conduit (Token' string) m (Token' string)
-> Conduit (Token' string) m (Token' string)
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> Conduit (Token' string) m (Token' string)
start)
    start' :: Token' string -> ConduitT (Token' string) (Token' string) m ()
start' (Text t :: string
t) = (string -> ConduitT (Token' string) string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
t ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
-> ConduitT (Token' string) string m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' string) string m ()
forall (m :: * -> *) string.
Monad m =>
Conduit (Token' string) m string
yieldWhileText) ConduitT (Token' string) string m ()
-> ConduitT string (Token' string) m ()
-> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= Dec builder string -> Conduit string m string
forall (m :: * -> *) string builder.
(Monad m, Monoid string, IsString string, Monoid builder,
 Eq string) =>
Dec builder string -> Conduit string m string
decodeEntities' Dec builder string
dec Conduit string m string
-> ConduitT string (Token' string) m ()
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b c r.
Monad m =>
Conduit a m b -> ConduitT b c m r -> ConduitT a c m r
=$= (string -> Maybe (Token' string))
-> ConduitT string (Token' string) m ()
forall (m :: * -> *) a b.
Monad m =>
(a -> Maybe b) -> ConduitT a b m ()
CL.mapMaybe string -> Maybe (Token' string)
forall s. (Eq s, IsString s) => s -> Maybe (Token' s)
go
    start' (TagOpen name :: string
name attrs :: [Attr' string]
attrs bool :: Bool
bool) = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (string -> [Attr' string] -> Bool -> Token' string
forall s. s -> [Attr' s] -> Bool -> Token' s
TagOpen string
name ((Attr' string -> Attr' string) -> [Attr' string] -> [Attr' string]
forall a b. (a -> b) -> [a] -> [b]
map ((string -> string) -> Attr' string -> Attr' string
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (d, b) (d, c)
second (Dec builder string -> string -> string
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder string
dec)) [Attr' string]
attrs) Bool
bool)
    start' token :: Token' string
token = Token' string -> ConduitT (Token' string) (Token' string) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield Token' string
token

    go :: s -> Maybe (Token' s)
go t :: s
t
        | s
t s -> s -> Bool
forall a. Eq a => a -> a -> Bool
== ""   = Maybe (Token' s)
forall a. Maybe a
Nothing
        | Bool
otherwise = Token' s -> Maybe (Token' s)
forall a. a -> Maybe a
Just (s -> Token' s
forall s. s -> Token' s
Text s
t)

-- | Decode entities in a complete string.
decodeString
  :: (Eq a, IsString a, Monoid builder, Monoid a)
  => Dec builder a -> a -> a
decodeString :: Dec builder a -> a -> a
decodeString dec :: Dec builder a
dec input :: a
input =
  case Dec builder a -> a -> (a, a)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder a
dec a
input of
    (value' :: a
value', remainder :: a
remainder)
      | a
value' a -> a -> Bool
forall a. Eq a => a -> a -> Bool
/= a
forall a. Monoid a => a
mempty -> a
value' a -> a -> a
forall a. Semigroup a => a -> a -> a
<> Dec builder a -> a -> a
forall a builder.
(Eq a, IsString a, Monoid builder, Monoid a) =>
Dec builder a -> a -> a
decodeString Dec builder a
dec a
remainder
      | Bool
otherwise -> a
input

decodeEntities' :: (Monad m
                   ,Monoid string
                   ,IsString string
                   ,Monoid builder
                   ,Eq string)
                => Dec builder string
                -> Conduit string m string
decodeEntities' :: Dec builder string -> Conduit string m string
decodeEntities' dec :: Dec builder string
dec =
    (string -> string) -> Conduit string m string
forall (m :: * -> *).
Monad m =>
(string -> string) -> ConduitT string string m ()
loop string -> string
forall a. a -> a
id
  where
    loop :: (string -> string) -> ConduitT string string m ()
loop accum :: string -> string
accum = do
        Maybe string
mchunk <- ConduitT string string m (Maybe string)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
        let chunk :: string
chunk = string -> string
accum (string -> string) -> string -> string
forall a b. (a -> b) -> a -> b
$ string -> Maybe string -> string
forall a. a -> Maybe a -> a
fromMaybe string
forall a. Monoid a => a
mempty Maybe string
mchunk
            (newStr :: string
newStr, remainder :: string
remainder) = Dec builder string -> string -> (string, string)
forall string builder.
(IsString string, Monoid builder, Eq string, Monoid string) =>
Dec builder string -> string -> (string, string)
makeEntityDecoder Dec builder string
dec string
chunk
        string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
newStr
        if Maybe string -> Bool
forall a. Maybe a -> Bool
isJust Maybe string
mchunk
            then (string -> string) -> ConduitT string string m ()
loop (string -> string -> string
forall a. Monoid a => a -> a -> a
mappend string
remainder)
            else string -> ConduitT string string m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield string
remainder

-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText :: Conduit (Token' string) m string
yieldWhileText =
    Conduit (Token' string) m string
forall o. ConduitT (Token' o) o m ()
loop
  where
    loop :: ConduitT (Token' o) o m ()
loop = ConduitT (Token' o) o m (Maybe (Token' o))
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await ConduitT (Token' o) o m (Maybe (Token' o))
-> (Maybe (Token' o) -> ConduitT (Token' o) o m ())
-> ConduitT (Token' o) o m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= ConduitT (Token' o) o m ()
-> (Token' o -> ConduitT (Token' o) o m ())
-> Maybe (Token' o)
-> ConduitT (Token' o) o m ()
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (() -> ConduitT (Token' o) o m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()) Token' o -> ConduitT (Token' o) o m ()
go
    go :: Token' o -> ConduitT (Token' o) o m ()
go (Text t :: o
t) = o -> ConduitT (Token' o) o m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield o
t ConduitT (Token' o) o m ()
-> ConduitT (Token' o) o m () -> ConduitT (Token' o) o m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ConduitT (Token' o) o m ()
loop
    go token :: Token' o
token = Token' o -> ConduitT (Token' o) o m ()
forall i o (m :: * -> *). i -> ConduitT i o m ()
leftover Token' o
token

-- | A decoder.
data Dec builder string = Dec
  { Dec builder string -> builder -> string
decToS     :: builder -> string
  , Dec builder string -> (Char -> Bool) -> string -> (string, string)
decBreak   :: (Char -> Bool) -> string -> (string,string)
  , Dec builder string -> string -> builder
decBuilder :: string -> builder
  , Dec builder string -> Int -> string -> string
decDrop    :: Int -> string -> string
  , Dec builder string -> string -> Maybe string
decEntity  :: string -> Maybe string
  , Dec builder string -> string -> Maybe (Char, string)
decUncons  :: string -> Maybe (Char,string)
  }

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
                  => Dec builder string -> string -> (string, string)
makeEntityDecoder :: Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = (builder -> string) -> (builder, string) -> (string, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first builder -> string
decToS ((builder, string) -> (string, string))
-> (string -> (builder, string)) -> string -> (string, string)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. string -> (builder, string)
go
  where
    go :: string -> (builder, string)
go s :: string
s =
      case (Char -> Bool) -> string -> (string, string)
decBreak (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
=='&') string
s of
        (_,"") -> (string -> builder
decBuilder string
s, "")
        (before :: string
before,restPlusAmp :: string
restPlusAmp@(Int -> string -> string
decDrop 1 -> string
rest)) ->
          case (Char -> Bool) -> string -> (string, string)
decBreak (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\c :: Char
c -> Char -> Bool
isNameChar Char
c Bool -> Bool -> Bool
|| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '#')) string
rest of
            (_,"") -> (string -> builder
decBuilder string
before, string
restPlusAmp)
            (entity :: string
entity,after :: string
after) -> (builder
before1 builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> builder
before2, string
after')
              where
                before1 :: builder
before1 = string -> builder
decBuilder string
before
                (before2 :: builder
before2, after' :: string
after') =
                  case Maybe string
mdecoded of
                    Nothing -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first ((string -> builder
decBuilder "&" builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<> string -> builder
decBuilder string
entity) builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
                    Just (string -> builder
decBuilder -> builder
decoded) ->
                      case string -> Maybe (Char, string)
decUncons string
after of
                        Just (';',validAfter :: string
validAfter) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
validAfter)
                        Just (_invalid :: Char
_invalid,_rest :: string
_rest) -> (builder -> builder) -> (builder, string) -> (builder, string)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first (builder
decoded builder -> builder -> builder
forall a. Semigroup a => a -> a -> a
<>) (string -> (builder, string)
go string
after)
                        Nothing -> (builder
forall a. Monoid a => a
mempty, string
s)
                mdecoded :: Maybe string
mdecoded =
                  if string
entity string -> string -> Bool
forall a. Eq a => a -> a -> Bool
== string
forall a. Monoid a => a
mempty
                     then Maybe string
forall a. Maybe a
Nothing
                     else string -> Maybe string
decEntity string
entity

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart :: Char -> Bool
isNameStart c :: Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== ':' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '_' Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiUpper Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isAsciiLower Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xC0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xD6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xD8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xF6') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xF8' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x370' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x37D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x37F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x1FFF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x200C' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x200D') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x2070' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x218F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x2C00' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2FEF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x3001' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xD7FF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xF900' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xFDCF') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\xFDF0' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xFFFD') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x10000' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\xEFFFF')

-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar :: Char -> Bool
isNameChar c :: Char
c =
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '-' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '.' Bool -> Bool -> Bool
||
  Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== '\xB7' Bool -> Bool -> Bool
||
  Char -> Bool
isDigit Char
c Bool -> Bool -> Bool
||
  Char -> Bool
isNameStart Char
c Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x0300' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x036F') Bool -> Bool -> Bool
||
  (Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
>= '\x203F' Bool -> Bool -> Bool
&& Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= '\x2040')