{-# LANGUAGE RecordWildCards, PatternGuards #-} module Text.HTML.TagSoup.Specification(parse) where import Text.HTML.TagSoup.Implementation import Data.Char -- We make some generalisations: -- <!name is a valid tag start closed by > -- <?name is a valid tag start closed by ?> -- </!name> is a valid closing tag -- </?name> is a valid closing tag -- <a "foo"> is a valid tag attibute in ! and ?, i.e missing an attribute name -- We also don't do lowercase conversion -- Entities are handled without a list of known entity names -- We don't have RCData, CData or Escape modes (only effects dat and tagOpen) data TypeTag = TypeNormal -- <foo | TypeXml -- <?foo | TypeDecl -- <!foo | TypeScript -- <script deriving Eq -- 2.4.1 Common parser idioms white x = x `elem` " \t\n\f\r" -- 8.2.4 Tokenization type Parser = S -> [Out] parse :: String -> [Out] parse = dat . state -- 8.2.4.1 Data state dat :: Parser dat S{..} = pos $ case hd of '&' -> charReference tl '<' -> tagOpen tl _ | eof -> [] _ -> hd & dat tl -- 8.2.4.2 Character reference data state charReference s = charRef dat False Nothing s -- 8.2.4.3 Tag open state tagOpen S{..} = case hd of '!' -> markupDeclOpen tl '/' -> closeTagOpen tl _ | isAlpha hd -> Tag & hd & tagName (if isScript s then TypeScript else TypeNormal) tl '>' -> errSeen "<>" & '<' & '>' & dat tl '?' -> neilXmlTagOpen tl -- NEIL _ -> errSeen "<" & '<' & dat s isScript = f "script" where f (c:cs) S{..} = toLower hd == c && f cs tl f [] S{..} = white hd || hd == '/' || hd == '>' || hd == '?' || eof -- seen "<?", emitted [] neilXmlTagOpen S{..} = pos $ case hd of _ | isAlpha hd -> Tag & '?' & hd & tagName TypeXml tl _ -> errSeen "<?" & '<' & '?' & dat s -- seen "?", expecting ">" neilXmlTagClose S{..} = pos $ case hd of '>' -> TagEnd & dat tl _ -> errSeen "?" & beforeAttName TypeXml s -- just seen ">" at the end, am given tl neilTagEnd typ S{..} | typ == TypeXml = pos $ errWant "?>" & TagEnd & dat s | typ == TypeScript = pos $ TagEnd & neilScriptBody s | otherwise = pos $ TagEnd & dat s -- Inside a <script> tag, only break on </script neilScriptBody o@S{..} | hd == '<', S{..} <- tl , hd == '/', S{..} <- tl , isScript s = dat o | eof = [] | otherwise = pos $ hd & neilScriptBody tl -- 8.2.4.4 Close tag open state -- Deviation: We ignore the if CDATA/RCDATA bits and tag matching -- Deviation: On </> we output </> to the text -- Deviation: </!name> is a closing tag, not a bogus comment closeTagOpen S{..} = case hd of _ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName TypeNormal tl '>' -> errSeen "</>" & '<' & '/' & '>' & dat tl _ | eof -> '<' & '/' & dat s _ -> errWant "tag name" & bogusComment s -- 8.2.4.5 Tag name state tagName typ S{..} = pos $ case hd of _ | white hd -> beforeAttName typ tl '/' -> selfClosingStartTag typ tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | isAlpha hd -> hd & tagName typ tl _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> hd & tagName typ tl -- 8.2.4.6 Before attribute name state beforeAttName typ S{..} = pos $ case hd of _ | white hd -> beforeAttName typ tl '/' -> selfClosingStartTag typ tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | typ /= TypeNormal && hd `elem` "\'\"" -> beforeAttValue typ s -- NEIL _ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName typ tl _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> AttName & hd & attName typ tl -- 8.2.4.7 Attribute name state attName typ S{..} = pos $ case hd of _ | white hd -> afterAttName typ tl '/' -> selfClosingStartTag typ tl '=' -> beforeAttValue typ tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | hd `elem` "\"'<" -> errSeen [hd] & def _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> def where def = hd & attName typ tl -- 8.2.4.8 After attribute name state afterAttName typ S{..} = pos $ case hd of _ | white hd -> afterAttName typ tl '/' -> selfClosingStartTag typ tl '=' -> beforeAttValue typ tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | typ /= TypeNormal && hd `elem` "\"'" -> AttVal & beforeAttValue typ s -- NEIL _ | hd `elem` "\"'<" -> errSeen [hd] & def _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> def where def = AttName & hd & attName typ tl -- 8.2.4.9 Before attribute value state beforeAttValue typ S{..} = pos $ case hd of _ | white hd -> beforeAttValue typ tl '\"' -> AttVal & attValueDQuoted typ tl '&' -> AttVal & attValueUnquoted typ s '\'' -> AttVal & attValueSQuoted typ tl '>' -> errSeen "=" & neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | hd `elem` "<=" -> errSeen [hd] & def _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> def where def = AttVal & hd & attValueUnquoted typ tl -- 8.2.4.10 Attribute value (double-quoted) state attValueDQuoted typ S{..} = pos $ case hd of '\"' -> afterAttValueQuoted typ tl '&' -> charRefAttValue (attValueDQuoted typ) (Just '\"') tl _ | eof -> errWant "\"" & dat s _ -> hd & attValueDQuoted typ tl -- 8.2.4.11 Attribute value (single-quoted) state attValueSQuoted typ S{..} = pos $ case hd of '\'' -> afterAttValueQuoted typ tl '&' -> charRefAttValue (attValueSQuoted typ) (Just '\'') tl _ | eof -> errWant "\'" & dat s _ -> hd & attValueSQuoted typ tl -- 8.2.4.12 Attribute value (unquoted) state attValueUnquoted typ S{..} = pos $ case hd of _ | white hd -> beforeAttName typ tl '&' -> charRefAttValue (attValueUnquoted typ) Nothing tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | hd `elem` "\"'<=" -> errSeen [hd] & def _ | eof -> errWant (if typ == TypeXml then "?>" else ">") & dat s _ -> def where def = hd & attValueUnquoted typ tl -- 8.2.4.13 Character reference in attribute value state charRefAttValue :: Parser -> Maybe Char -> Parser charRefAttValue resume c s = charRef resume True c s -- 8.2.4.14 After attribute value (quoted) state afterAttValueQuoted typ S{..} = pos $ case hd of _ | white hd -> beforeAttName typ tl '/' -> selfClosingStartTag typ tl '>' -> neilTagEnd typ tl '?' | typ == TypeXml -> neilXmlTagClose tl _ | eof -> dat s _ -> errSeen [hd] & beforeAttName typ s -- 8.2.4.15 Self-closing start tag state selfClosingStartTag typ S{..} = pos $ case hd of _ | typ == TypeXml -> errSeen "/" & beforeAttName typ s '>' -> TagEndClose & dat tl _ | eof -> errWant ">" & dat s _ -> errSeen "/" & beforeAttName typ s -- 8.2.4.16 Bogus comment state bogusComment S{..} = Comment & bogusComment1 s bogusComment1 S{..} = pos $ case hd of '>' -> CommentEnd & dat tl _ | eof -> CommentEnd & dat s _ -> hd & bogusComment1 tl -- 8.2.4.17 Markup declaration open state markupDeclOpen S{..} = pos $ case hd of _ | Just s <- next "--" -> Comment & commentStart s _ | isAlpha hd -> Tag & '!' & hd & tagName TypeDecl tl -- NEIL _ | Just s <- next "[CDATA[" -> cdataSection s _ -> errWant "tag name" & bogusComment s -- 8.2.4.18 Comment start state commentStart S{..} = pos $ case hd of '-' -> commentStartDash tl '>' -> errSeen "<!-->" & CommentEnd & dat tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> hd & comment tl -- 8.2.4.19 Comment start dash state commentStartDash S{..} = pos $ case hd of '-' -> commentEnd tl '>' -> errSeen "<!--->" & CommentEnd & dat tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> '-' & hd & comment tl -- 8.2.4.20 Comment state comment S{..} = pos $ case hd of '-' -> commentEndDash tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> hd & comment tl -- 8.2.4.21 Comment end dash state commentEndDash S{..} = pos $ case hd of '-' -> commentEnd tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> '-' & hd & comment tl -- 8.2.4.22 Comment end state commentEnd S{..} = pos $ case hd of '>' -> CommentEnd & dat tl '-' -> errWant "-->" & '-' & commentEnd tl _ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl '!' -> errSeen "!" & commentEndBang tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> errSeen "--" & '-' & '-' & hd & comment tl -- 8.2.4.23 Comment end bang state commentEndBang S{..} = pos $ case hd of '>' -> CommentEnd & dat tl '-' -> '-' & '-' & '!' & commentEndDash tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> '-' & '-' & '!' & hd & comment tl -- 8.2.4.24 Comment end space state commentEndSpace S{..} = pos $ case hd of '>' -> CommentEnd & dat tl '-' -> commentEndDash tl _ | white hd -> hd & commentEndSpace tl _ | eof -> errWant "-->" & CommentEnd & dat s _ -> hd & comment tl -- 8.2.4.38 CDATA section state cdataSection S{..} = pos $ case hd of _ | Just s <- next "]]>" -> dat s _ | eof -> dat s _ | otherwise -> hd & cdataSection tl -- 8.2.4.39 Tokenizing character references -- Change from spec: this is reponsible for writing '&' if nothing is to be written charRef :: Parser -> Bool -> Maybe Char -> S -> [Out] charRef resume att end S{..} = pos $ case hd of _ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s '#' -> charRefNum resume s tl _ -> charRefAlpha resume att s charRefNum resume o S{..} = pos $ case hd of _ | hd `elem` "xX" -> charRefNum2 resume o True tl _ -> charRefNum2 resume o False s charRefNum2 resume o hex S{..} = pos $ case hd of _ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl _ -> errSeen "&" & '&' & resume o charRefNum3 resume hex S{..} = pos $ case hd of _ | hexChar hex hd -> hd & charRefNum3 resume hex tl ';' -> EntityEnd True & resume tl _ -> EntityEnd False & errWant ";" & resume s charRefAlpha resume att S{..} = pos $ case hd of _ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl _ -> errSeen "&" & '&' & resume s charRefAlpha2 resume att S{..} = pos $ case hd of _ | alphaChar hd -> hd & charRefAlpha2 resume att tl ';' -> EntityEnd True & resume tl _ | att -> EntityEnd False & resume s _ -> EntityEnd False & errWant ";" & resume s alphaChar x = isAlphaNum x || x `elem` ":-_" hexChar False x = isDigit x hexChar True x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F')