// Sean Bamforth HTML Parser
//
// Created by Sean Bamforth
Use UI

// One attribute of an HTML tag, as filled in by cHTMLParser.ParseHTMLTag.
Struct tAttrPair
    // Attribute name token.
    String sAttr 
    // Token that followed the "=" sign; only set when isAssigned is True.
    String sValue 
    // True when the attribute name was followed by "=", False for a bare attribute.
    Boolean isAssigned
End_Struct

// The parsed content of a single tag, as returned by cHTMLParser.ParseHTMLTag.
Struct tTagData 
    // First token inside the angle brackets (ParseHTMLTag stores Tokens[0] here).
    String sValue
    // Remaining tokens grouped into attributes, in the order they appear in the tag.
    tAttrPair[] aAttributes
    // True when the tag text ended with "/" just before the closing bracket.
    Boolean isAloneTag
End_Struct

// One fragment of a document, as produced by cHTMLParser.ParseHTML.
Struct tParsedHTML 
    // Fragment kind: ParseHTML stores either "text" or "tag" here.
    String sType 
    // Source text of the fragment; for a tag this includes the angle brackets.
    String sValue
    // Filled by ParseHTMLTag when sType is "tag"; left at its default otherwise.
    tTagData tagdata
End_Struct

// cHTMLParser splits HTML text into "text" and "tag" fragments (ParseHTML),
// breaks each tag into a name plus attributes (ParseHTMLTag) and can turn
// the fragments back into HTML (toHTML / toFlatHTML).
//
// Known limitations, unchanged by design:
//   - a ">" inside a quoted attribute value ends the tag early;
//   - text fragments are trimmed, so whitespace next to a tag is not kept;
//   - a value holding both quote characters is wrapped in single quotes
//     without escaping.
Class cHTMLParser is a cObject 

    // Returns True when sInp is empty, consists of spaces only, or is a single
    // tab, line feed, form feed or carriage return character.
    // The explicit character codes are checked in addition to Trim so that
    // tabs and line breaks inside a tag also separate tokens.
    Function isHTMLSpace String sInp Returns Boolean 
        Integer iCode

        If ((Trim(sInp))="") Function_Return True
        If ((Length(sInp))<>1) Function_Return False

        Move (Ascii(sInp)) to iCode
        Function_Return ((iCode=9) or (iCode=10) or (iCode=12) or (iCode=13))
    End_Function
    
    // Splits the inside of a tag (angle brackets already removed) into tokens.
    //   sText   - tag content, e.g. 'a href="x y" disabled'
    //   returns - the tokens in source order: names, "=" signs and values.
    //             Quotes are removed from quoted values.
    //
    // A small state machine drives the scan; sType is one of:
    //   "text"         - inside an unquoted token
    //   "space"        - between tokens
    //   "single-quote" - inside a '...' value
    //   "double-quote" - inside a "..." value
    Function tokenizeTags String sText Returns String[]
        Integer iPos 
        Integer iMax 
        String sType 
        String sCurrentToken 
        Pointer pText 
        String sChar 
        String[] Tokens

        Move "" to sCurrentToken 
        Move "text" to sType 
        
        Move (AddressOf(sText)) to pText
        Move (Length(sText)) to iMax
        For iPos from 0 to (iMax-1)
            Move (Character(DeRefC(pText,iPos))) to sChar  
            
            If (sType = "space") Begin 
                // sCurrentToken is always empty here: every transition into
                // this state clears it and whitespace is not collected.
                If (sChar = '"') Begin 
                    Move "double-quote" to sType
                End
                Else If (sChar = "'") Begin 
                    Move "single-quote" to sType
                End
                Else If (sChar = "=") Begin 
                    Move "=" to Tokens[(SizeOfArray(Tokens))]
                End
                Else If (not(isHTMLSpace(Self,sChar))) Begin 
                    Move "text" to sType 
                    Move sChar to sCurrentToken
                End
                // Any further whitespace between tokens is skipped, so it
                // cannot leak into the next token or quoted value.
            End
            Else If (sType = "text") Begin 
                If (isHTMLSpace(Self,sChar)) Begin 
                    Move "space" to sType 
                    If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
                    Move "" to sCurrentToken
                End
                Else If (sChar = "=") Begin 
                    Move "space" to sType 
                    If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
                    Move "=" to Tokens[(SizeOfArray(Tokens))]
                    Move "" to sCurrentToken
                End
                Else Begin 
                    Append sCurrentToken sChar 
                End
            End
            Else If (sType = "single-quote") Begin 
                If (sChar = "'") Begin 
                    // Always emit the value, even when empty, so that
                    // attr='' keeps its (empty) value token.
                    Move "space" to sType 
                    Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
                    Move "" to sCurrentToken
                End
                Else Begin 
                    Append sCurrentToken sChar 
                End
            End
            Else If (sType = "double-quote") Begin 
                If (sChar = '"') Begin 
                    // Same as above for attr="".
                    Move "space" to sType 
                    Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
                    Move "" to sCurrentToken
                End
                Else Begin 
                    Append sCurrentToken sChar 
                End
            End
        Loop

        // Flush whatever is left (last unquoted token or an unterminated quote).
        If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]

        Function_Return Tokens 
    End_Function
    
    // Parses one tag into its name, attributes and self-closing flag.
    //   sText   - the tag text, normally including the angle brackets,
    //             e.g. '<img src="a.png" alt="" />'
    //   returns - tTagData with sValue set to the first token, aAttributes
    //             filled in source order and isAloneTag set when the tag
    //             ends with "/".
    Function ParseHTMLTag String sText Returns tTagData
        Integer iMax 
        Integer iPos 
        String[] Tokens
        tTagData tagData 
        Integer iAttrNo
        
        // Strip the angle brackets, but only those actually present, so an
        // unterminated tag such as "<div" at the end of the input keeps its
        // last character.
        If ((Left(sText,1))="<") Begin 
            Move (Length(sText)) to iMax
            Move (Right(sText,(iMax-1))) to sText 
        End
        If ((Right(sText,1))=">") Begin 
            Move (Length(sText)) to iMax
            Move (Left(sText,(iMax-1))) to sText 
        End
        
        Move (False) to tagData.isAloneTag
        If ((Right(sText,1))="/") Begin 
            Move (True) to tagData.isAloneTag
            Move (Length(sText)) to iMax
            Move (Left(sText,(iMax-1))) to sText 
        End
        
        Move (tokenizeTags(Self,sText)) to Tokens        
        
        // iMax is the number of real tokens. Two empty entries are added
        // after them so the look-ahead at iPos+1 and iPos+2 stays in bounds.
        Move (SizeOfArray(Tokens)) to iMax 
        Move "" to Tokens[iMax+1]
        
        Move Tokens[0] to tagData.sValue
        Move 1 to iPos 
        Move 0 to iAttrNo
        While (iPos < iMax)
            // Empty tokens (from a stray empty quoted string) are not names.
            If (Tokens[iPos] <> "") Begin 
                Move Tokens[iPos] to tagData.aAttributes[iAttrNo].sAttr
                If (Tokens[iPos + 1] = "=") Begin 
                    Move Tokens[iPos+2] to tagData.aAttributes[iAttrNo].sValue
                    Move (iPos+2) to iPos // skip the "=" and the value
                    Move True to tagData.aAttributes[iAttrNo].isAssigned
                End
                Else Begin 
                    Move False to tagData.aAttributes[iAttrNo].isAssigned
                End
                
                Move (iAttrNo+1) to iAttrNo
            End
            Move (iPos +1) to iPos 
        Loop
        
        Function_Return tagData 
    End_Function
    
    // Splits HTML text into an ordered list of fragments.
    //   sText   - the HTML source
    //   returns - one tParsedHTML per fragment; sType is "text" or "tag".
    //             Text fragments are trimmed and dropped when empty. Tag
    //             fragments keep their angle brackets in sValue and have
    //             tagData filled in by ParseHTMLTag.
    Function ParseHTML String sText Returns tParsedHTML[]
        String sType 
        tParsedHTML[] aParsed
        String sChar
        String sCurrentToken
        Integer iMax 
        Integer iPos 
        Pointer pText
        Integer iTokenPos
        
        Move "text" to sType 
        Move "" to sCurrentToken
        
        Move (AddressOf(sText)) to pText
        Move (Length(sText)) to iMax
    
        For iPos from 0 to (iMax-1)
            Move (Character(DeRefC(pText,iPos))) to sChar 
            
            If ((sChar = "<") and (sType="text")) Begin 
                // A tag starts: store the text collected so far, if any.
                Move (Trim(sCurrentToken)) to sCurrentToken
                If (sCurrentToken <>"") Begin 
                    Move (SizeOfArray(aParsed)) to iTokenPos
                    Move sCurrentToken to aParsed[iTokenPos].sValue
                    Move sType to aParsed[iTokenPos].sType
                End
                Move sChar to sCurrentToken
                Move "tag" to sType 
            End
            Else If ((sChar = ">") and (sType="tag")) Begin 
                // A tag ends: store it including both angle brackets.
                Append sCurrentToken sChar
                Move (SizeOfArray(aParsed)) to iTokenPos
                Move sCurrentToken to aParsed[iTokenPos].sValue
                Move sType to aParsed[iTokenPos].sType
                Move "text" to sType 
                Move "" to sCurrentToken
            End
            Else Begin 
                Append sCurrentToken sChar 
            End
            
        Loop
        
        // Store whatever follows the last tag (or an unterminated tag).
        Move (Trim(sCurrentToken)) to sCurrentToken
        If (sCurrentToken<>"") Begin 
            Move (SizeOfArray(aParsed)) to iTokenPos
            Move sCurrentToken to aParsed[iTokenPos].sValue
            Move sType to aParsed[iTokenPos].sType
        End
        
        // Second pass: break every tag fragment into name and attributes.
        Move (SizeOfArray(aParsed)) to iMax 
        For iPos from 0 to (iMax-1) 
            If (aParsed[iPos].sType = "tag") Get ParseHTMLTag aParsed[iPos].sValue to aParsed[iPos].tagData
        Loop
        
        Function_Return aParsed
    End_Function
    
    // Alias for ParseHTML.
    Function SplitHTML String sInp Returns tParsedHTML[]
        Function_Return (ParseHTML(Self,sInp))
    End_Function
    
    // Wraps an attribute value in quotes when it cannot be written bare.
    //   sText   - the attribute value
    //   returns - sText unchanged when it is non-empty and made up only of
    //             letters and digits; otherwise sText wrapped in double
    //             quotes, or in single quotes when it contains a double quote.
    Function QuoteIfNeeded String sText Returns String 
        Pointer pText 
        Integer iPos 
        Integer iMax 
        String sChar 
        Boolean isQuotable 
        String sQuoteCode
            
        Move '"' to sQuoteCode
        
        Move (AddressOf(sText)) to pText
        Move (Length(sText)) to iMax

        // An empty value must be quoted: a bare "attr=" would make a reader
        // take whatever follows as the value.
        Move (iMax = 0) to isQuotable

        For iPos from 0 to (iMax-1)
            Move (Character(DeRefC(pText,iPos))) to sChar  
            Move (Lowercase(sChar)) to sChar 
            If ((Pos(sChar,"abcdefghijklmnopqrstuvwxyz0123456789"))=0) Move (True) to isQuotable
            If (sChar = '"') Move "'" to sQuoteCode
        Loop
        
        If (isQuotable) Move (sQuoteCode+sText+sQuoteCode) to sText 
        Function_Return sText 
    End_Function
    
    // Builds the HTML text of one tag from its parsed form.
    //   tag     - the tag data (name, attributes, self-closing flag)
    //   returns - "<name attr=value ...>" or "<name attr=value .../>".
    Function TagToHtml tTagData tag Returns String 
        String sRet 
        Integer iPos 
        Integer iLast 
        
        Move "<" to sRet 
        Append sRet tag.sValue
        
        Move ((SizeOfArray(tag.aAttributes))-1) to iLast 
        For iPos from 0 to iLast 
            Append sRet " " tag.aAttributes[iPos].sAttr 
            If (tag.aAttributes[iPos].isAssigned) Begin 
                Append sRet "=" (QuoteIfNeeded(Self,tag.aAttributes[iPos].sValue))
            End
        Loop
        
        If (tag.isAloneTag) Append sRet "/>"
        Else Append sRet ">"
        
        Function_Return sRet 
    End_Function 
    
    // Joins parsed fragments back into one HTML string with no separators.
    //   Parsed  - fragments as returned by ParseHTML
    //   returns - tags rebuilt through TagToHtml, text copied as stored.
    Function toHTML tParsedHTML[] Parsed Returns String 
        Integer iPos 
        Integer iLast 
        String sRet 
        tParsedHTML HtmlFragment
        
        Move ((SizeOfArray(Parsed))-1) to iLast
        For iPos from 0 to iLast
            Move Parsed[iPos] to HtmlFragment
            If (HtmlFragment.sType = "tag") Append sRet (TagToHtml(Self,HtmlFragment.tagdata))
            If (HtmlFragment.sType = "text") Append sRet HtmlFragment.sValue
        Loop
        
        Function_Return sRet 
    End_Function 
    
    // Same as toHTML, but every fragment is followed by a CR/LF so each tag
    // and each text run ends up on its own line.
    //   Parsed  - fragments as returned by ParseHTML
    //   returns - the HTML text, one fragment per line.
    Function toFlatHTML tParsedHTML[] Parsed Returns String 
        Integer iPos 
        Integer iLast 
        String sRet 
        tParsedHTML HtmlFragment
        
        Move ((SizeOfArray(Parsed))-1) to iLast
        For iPos from 0 to iLast
            Move Parsed[iPos] to HtmlFragment
            
            If (HtmlFragment.sType = "tag") Append sRet (TagToHtml(Self,HtmlFragment.tagdata))
            If (HtmlFragment.sType = "text") Append sRet HtmlFragment.sValue
            Append sRet CR_LF
        Loop
        
        Function_Return sRet 
    End_Function 
    
End_Class

// Comments (0)