Snippets

SeanB HTML Parser

Created by SeanB

File HTMLParse.pkg Added

  • Ignore whitespace
  • Hide word diff
+Use UI
+
+// One attribute of a parsed tag, as filled in by cHTMLParser.ParseHTMLTag.
+Struct tAttrPair
+    // Attribute name token.
+    String sAttr 
+    // Token that followed the "=" sign; only written when isAssigned is True.
+    String sValue 
+    // True when the attribute name was followed by "=", False for a bare attribute.
+    Boolean isAssigned
+End_Struct
+
+// Structured form of a single tag, as produced by cHTMLParser.ParseHTMLTag.
+Struct tTagData 
+    // First token of the tag text (the tag name).
+    String sValue
+    // Attributes in the order they were found in the tag.
+    tAttrPair[] aAttributes
+    // True when the tag text ended with "/" just before the closing bracket.
+    Boolean isAloneTag
+End_Struct
+
+// One fragment of a parsed document, as produced by cHTMLParser.ParseHTML.
+Struct tParsedHTML 
+    // Fragment kind: "text" or "tag".
+    String sType 
+    // For "text": the trimmed text. For "tag": the tag text as scanned, starting at "<".
+    String sValue
+    // Parsed tag details; ParseHTML fills this in only for "tag" fragments.
+    tTagData tagdata
+End_Struct
+
+Class cHTMLParser is a cObject 
+    // Returns True when sInp consists solely of HTML whitespace characters:
+    // space (32), tab (9), line feed (10), form feed (12) or carriage return (13).
+    // An empty string also returns True, as it did with the previous Trim-based test.
+    //
+    // Trim only strips spaces, so tabs and line breaks used to be treated as part
+    // of a token; each character is therefore checked by its character code.
+    Function isHTMLSpace String sInp Returns Boolean 
+        Integer iPos
+        Integer iMax
+        Integer iCode
+
+        Move (Length(sInp)) to iMax
+        For iPos from 1 to iMax
+            // Mid takes (string, length, start position)
+            Move (Ascii(Mid(sInp,1,iPos))) to iCode
+            If (not((iCode=32) or (iCode=9) or (iCode=10) or (iCode=12) or (iCode=13))) Function_Return False
+        Loop
+        Function_Return True
+    End_Function
+    
+    // Splits the inside of a tag (angle brackets already removed) into tokens:
+    // the tag name, attribute names, "=" signs and attribute values.
+    //
+    // sText   - tag text without the surrounding "<" and ">"
+    // Returns - the tokens in source order; quoted values are returned without
+    //           their quotes
+    //
+    // The scanner is a small state machine driven by sType:
+    //   "text"         - inside an unquoted token (initial state)
+    //   "space"        - between tokens (after whitespace, "=" or a closing quote)
+    //   "single-quote" - inside a '...' value
+    //   "double-quote" - inside a "..." value
+    // A quote character only opens a quoted value in the "space" state; in any
+    // other state it is kept as an ordinary character.
+    Function tokenizeTags String sText Returns String[]
+        Integer iPos 
+        Integer iMax 
+        String sType 
+        String sCurrentToken 
+        Pointer pText 
+        String sChar 
+        String[] Tokens
+
+        Move "" to sCurrentToken 
+        Move "text" to sType 
+        
+        // Read the string one character at a time through its address
+        Move (AddressOf(sText)) to pText
+        Move (Length(sText)) to iMax
+        For iPos from 0 to (iMax-1)
+            Move (Character(DeRefC(pText,iPos))) to sChar  
+            
+            If ((sType = "space") and (sChar = '"') ) Begin 
+                Move "double-quote" to sType
+            End
+            Else If ((sType = "space") and (sChar = "'") ) Begin 
+                Move "single-quote" to sType
+            End
+            Else If ((sChar = "=") and ((sType = "space") or (sType = "text")) ) Begin 
+                // "=" closes any pending token and is emitted as a token of its own
+                Move "space" to sType 
+                If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
+                Move "=" to Tokens[(SizeOfArray(Tokens))]
+                Move "" to sCurrentToken
+            End
+            Else If ((sType = "space") and (not(isHTMLSpace(Self,sChar))) ) Begin 
+                // First character of a new unquoted token
+                Move "text" to sType 
+                If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
+                Move sChar to sCurrentToken
+            End
+            Else If ((sType = "text") and (isHTMLSpace(Self,sChar)) ) Begin 
+                // Whitespace ends the current unquoted token
+                Move "space" to sType 
+                If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
+                Move "" to sCurrentToken
+            End
+            Else If (((sType="single-quote") and (sChar="'")) or ((sType="double-quote") and (sChar='"')) ) Begin 
+                // Closing quote. The value is emitted even when it is empty:
+                // dropping the token for attr="" would make the caller take the
+                // next attribute name as this attribute's value.
+                Move "space" to sType 
+                Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
+                Move "" to sCurrentToken
+            End
+            Else Append sCurrentToken sChar 
+        Loop
+        // Flush whatever is pending at the end (also covers an unterminated quote)
+        If (sCurrentToken<>"") Move sCurrentToken to Tokens[(SizeOfArray(Tokens))]
+
+        Function_Return Tokens 
+    End_Function
+    
+    // Parses the text of one tag into its name, attributes and self-closing flag.
+    //
+    // sText   - the tag as scanned from the document, normally "<...>"
+    // Returns - tTagData with:
+    //             sValue      first token of the tag (the tag name)
+    //             aAttributes one entry per attribute; isAssigned is True and sValue
+    //                         is set when the name was followed by "="
+    //             isAloneTag  True when the tag text ended with "/" before the ">"
+    Function ParseHTMLTag String sText Returns tTagData
+        Integer iMax 
+        Integer iPos 
+        String[] Tokens
+        tTagData tagData 
+        Integer iAttrNo
+        
+        // Strip the angle brackets. Each bracket is removed only when it is
+        // actually present, so an unterminated tag at the end of the input
+        // ("<div") keeps its last character and a lone "<" no longer asks Mid
+        // for a negative length.
+        If ((Left(sText,1))="<") Move (Right(sText,((Length(sText))-1))) to sText
+        If ((Right(sText,1))=">") Move (Left(sText,((Length(sText))-1))) to sText
+        
+        // A trailing "/" marks a self-closing tag; remove it before tokenizing
+        Move (False) to tagData.isAloneTag
+        If ((right(sText,1))="/") Begin 
+            Move (True) to tagData.isAloneTag
+            Move (Length(sText)) to iMax
+            Move (Left(sText,(iMax-1))) to sText 
+        End
+        
+        Move (tokenizeTags(Self,sText)) to Tokens        
+        
+        //we have the tokens
+        Move (SizeOfArray(Tokens)) to iMax 
+        // Pad the array with empty entries so that the look-ahead reads at
+        // iPos+1 and iPos+2 below never run past the end
+        Move "" to Tokens[iMax+1] //stop any overflow when looking ahead. 
+        
+        // Token 0 is the tag name; the remaining tokens are attributes
+        Move Tokens[0] to tagData.sValue
+        Move 1 to iPos 
+        Move 0 to iAttrNo
+        While (iPos < iMax)
+            // Empty tokens in name position are skipped
+            If (Tokens[iPos] <> "") Begin 
+                Move Tokens[iPos] to tagData.aAttributes[iAttrNo].sAttr
+                If (Tokens[iPos + 1] = "=") Begin 
+                    // name = value: consume the "=" and the value token as well
+                    Move Tokens[iPos+2] to tagData.aAttributes[iAttrNo].sValue
+                    Move (iPos+2) to iPos 
+                    Move True to tagData.aAttributes[iAttrNo].isAssigned
+                End
+                Else Move False to tagData.aAttributes[iAttrNo].isAssigned
+                
+                Move (iAttrNo+1) to iAttrNo
+            End
+            Move (iPos +1) to iPos 
+        Loop
+        
+        Function_Return tagData 
+    End_Function
+    
+    // Splits an HTML string into a flat sequence of text and tag fragments.
+    //
+    // sText   - HTML source
+    // Returns - one tParsedHTML per fragment, in source order:
+    //             sType "text": sValue holds the text between tags after Trim;
+    //                           fragments that are empty after Trim are dropped
+    //             sType "tag" : sValue holds the tag as scanned, starting at "<";
+    //                           tagdata holds the result of ParseHTMLTag
+    //
+    // Limitations: a tag ends at the first ">" after its "<", even when that ">"
+    // sits inside a quoted attribute value or a comment. Input that ends inside
+    // a tag yields a final "tag" fragment without a closing ">".
+    Function ParseHTML String sText Returns tParsedHTML[]
+        String sType 
+        tParsedHTML[] aParsed
+        String sChar
+        String sCurrentToken
+        Integer iMax 
+        Integer iPos 
+        Pointer pText
+        Integer iTokenPos
+        
+        Move "text" to sType 
+        Move "" to sCurrentToken
+        
+        // Read the string one character at a time through its address
+        Move (AddressOf(sText)) to pText
+        Move (Length(sText)) to iMax
+    
+        For iPos from 0 to (iMax-1)
+            Move (Character(DeRefC(pText,iPos))) to sChar 
+            
+            If ((sChar = "<") and (sType="text")) Begin 
+                // Start of a tag: store the pending text (if any), then start
+                // collecting the tag with its opening bracket
+                Move (Trim(sCurrentToken)) to sCurrentToken
+                If (sCurrentToken <>"") Begin 
+                    Move (SizeOfArray(aParsed)) to iTokenPos
+                    Move sCurrentToken to aParsed[iTokenPos].sValue
+                    Move sType to aParsed[iTokenPos].sType
+                End
+                Move "" to sCurrentToken
+                Append sCurrentToken sChar
+                Move "tag" to sType 
+            End
+            Else If ((sChar = ">") and (sType="tag")) Begin 
+                // End of a tag: store it including the closing bracket
+                Append sCurrentToken sChar
+                Move (SizeOfArray(aParsed)) to iTokenPos
+                Move sCurrentToken to aParsed[iTokenPos].sValue
+                Move sType to aParsed[iTokenPos].sType
+                Move "text" to sType 
+                Move "" to sCurrentToken
+            End
+            Else Begin 
+                Append sCurrentToken sChar 
+            End
+            
+        Loop
+        
+        // Store whatever is left after the last character, under the current type
+        Move (Trim(sCurrentToken)) to sCurrentToken
+        If (sCurrentToken<>"") Begin 
+            Move (SizeOfArray(aParsed)) to iTokenPos
+            Move sCurrentToken to aParsed[iTokenPos].sValue
+            Move sType to aParsed[ iTokenPos ].sType
+        End
+        
+        // Second pass: break every tag fragment down into name and attributes
+        Move (SizeOfArray(aParsed)) to iMax 
+        For iPos from 0 to (iMax-1) 
+            If (aParsed[iPos].sType = "tag") Get ParseHTMLTag aParsed[iPos].sValue to aParsed[iPos].tagData
+        Loop
+        
+        Function_Return aParsed
+    End_Function
+    
+    // Alias for ParseHTML: returns the text and tag fragments of sInp.
+    Function SplitHTML String sInp Returns tParsedHTML[]
+        tParsedHTML[] aFragments
+
+        Get ParseHTML sInp to aFragments
+        Function_Return aFragments
+    End_Function
+    
+    // Returns an attribute value ready to be written after "=" in a tag.
+    //
+    // sText   - raw attribute value
+    // Returns - sText unchanged when it is non-empty and made up only of letters
+    //           and digits; otherwise sText wrapped in quotes:
+    //             - double quotes by default
+    //             - single quotes when the value contains a double quote
+    //             - double quotes with every '"' replaced by "&quot;" when the
+    //               value contains both quote characters, so that no quote can
+    //               end the attribute early
+    Function QuoteIfNeeded String sText Returns String 
+        Pointer pText 
+        Integer iPos 
+        Integer iMax 
+        String sChar 
+        Boolean isQuotable 
+        Boolean hasDouble
+        Boolean hasSingle
+        String sQuoteCode
+            
+        Move (False) to hasDouble
+        Move (False) to hasSingle
+        Move (Length(sText)) to iMax
+        // An empty value must be quoted: after a bare "attr=" an HTML parser
+        // takes the following attribute as this attribute's value.
+        Move (iMax = 0) to isQuotable
+        
+        Move (AddressOf(sText)) to pText
+        For iPos from 0 to (iMax-1)
+            Move (Character(DeRefC(pText,iPos))) to sChar  
+            Move (Lowercase(sChar)) to sChar 
+            // Anything other than a letter or digit forces quoting
+            If ((pos(sChar,"abcdefghijlkmnopqrstuvwxyz01234567890"))=0) Move (True) to isQuotable
+            If (sChar = '"') Move (True) to hasDouble
+            If (sChar = "'") Move (True) to hasSingle
+        Loop
+        
+        If (not(isQuotable)) Function_Return sText
+        
+        Move '"' to sQuoteCode
+        If (hasDouble and hasSingle) Begin
+            // Neither quote character can delimit the value as-is
+            Move (Replaces('"',sText,"&quot;")) to sText
+        End
+        Else If (hasDouble) Begin
+            Move "'" to sQuoteCode
+        End
+        
+        Function_Return (sQuoteCode+sText+sQuoteCode)
+    End_Function
+    
+    // Renders one parsed tag back to markup: "<", the tag name, each attribute
+    // preceded by a space (with "=" and the value from QuoteIfNeeded when the
+    // attribute is assigned), then "/>" for a self-closing tag or ">" otherwise.
+    Function TagToHtml tTagData tag Returns String 
+        String sMarkup
+        Integer iAttr
+        Integer iAttrCount
+        
+        Move "<" to sMarkup
+        Append sMarkup tag.sValue
+        
+        Move (SizeOfArray(tag.aAttributes)) to iAttrCount
+        Move 0 to iAttr
+        While (iAttr < iAttrCount)
+            Append sMarkup " " tag.aAttributes[iAttr].sAttr
+            If (tag.aAttributes[iAttr].isAssigned) Begin
+                Append sMarkup "=" (QuoteIfNeeded(Self,tag.aAttributes[iAttr].sValue))
+            End
+            Increment iAttr
+        Loop
+        
+        If (tag.isAloneTag) Begin
+            Append sMarkup "/>"
+        End
+        Else Begin
+            Append sMarkup ">"
+        End
+        
+        Function_Return sMarkup
+    End_Function 
+    
+    // Joins parsed fragments back into one HTML string, with nothing inserted
+    // between them. "tag" fragments are rendered through TagToHtml, "text"
+    // fragments contribute their sValue; any other fragment type adds nothing.
+    Function toHTML tParsedHTML[] Parsed Returns String 
+        Integer iItem
+        Integer iCount
+        String sHtml
+        
+        Move (SizeOfArray(Parsed)) to iCount
+        Move 0 to iItem
+        While (iItem < iCount)
+            If (Parsed[iItem].sType = "tag") Begin
+                Append sHtml (TagToHtml(Self,Parsed[iItem].tagdata))
+            End
+            Else If (Parsed[iItem].sType = "text") Begin
+                Append sHtml Parsed[iItem].sValue
+            End
+            Increment iItem
+        Loop
+        
+        Function_Return sHtml
+    End_Function 
+    
+    // Like toHTML, but writes every fragment on its own line: each fragment
+    // (including one of an unrecognised type, which contributes no text) is
+    // followed by CR_LF.
+    //
+    // Parsed  - fragments as returned by ParseHTML
+    // Returns - the rebuilt markup, one fragment per line
+    //
+    // The leftover debug Showln of the result length has been removed; building
+    // the string no longer writes to the console.
+    Function toFlatHTML tParsedHTML[] Parsed Returns String 
+        Integer iPos 
+        Integer iMax 
+        String sRet 
+        tParsedHTML HtmlFragment
+        
+        Move (SizeOfArray(Parsed)) to iMax
+        For iPos from 0 to (iMax-1)
+            Move parsed[iPos] to HtmlFragment
+            
+            If (HtmlFragment.sType = "tag") Append sRet (TagToHtml(Self,HtmlFragment.tagdata))
+            If (HtmlFragment.sType = "text") Append sRet HtmlFragment.sValue
+            Append sRet CR_LF
+        Loop
+        
+        Function_Return sRet 
+    End_Function 
+    
+End_Class