Commits

Anonymous committed 506f090

HTML parsing mostly working

Comments (0)

Files changed (1)

 (import chicken scheme)
-(use comparse clojurian-syntax latch sxml-transforms irregex
-     srfi-1 srfi-14 data-structures char-set-literals html-parser)
+(use srfi-1 srfi-13 srfi-14 data-structures irregex
+     comparse clojurian-syntax latch sxml-transforms char-set-literals html-parser)
 
 (define (node el parser)
   (sequence* ((content parser))
               (_ (preceded-by space* (char-seq ticks))))
     (result code)))
 
+(define alphanumeric-ascii
+  (satisfies #[A-Za-z0-9]))
+
 ;; HexEntity =     < '&' '#' [Xx] [0-9a-fA-F]+ ';' >
 ;; DecEntity =     < '&' '#' [0-9]+ > ';' >
 ;; CharEntity =    < '&' [A-Za-z0-9]+ ';' >
                                         (one-or-more (satisfies #[0-9a-fA-F])))
                               (sequence (is #\#)
                                         (one-or-more (satisfies #[0-9])))
-                              (one-or-more (satisfies #[A-Za-z0-9])))
+                              (one-or-more alphanumeric-ascii))
                       (is #\;))
             (as-string))
         (o result html-strip)))
                   (as-string))))
     (result (cons 'blockquote (parse document text)))))
 
+
+
+(define html-block-elements
+  (append-map
+   (lambda (el)
+     (map char-seq (list el (string-upcase el))))
+   '("address" "blockquote" "center" "dir" "div" "dl" "fieldset" "form" "h1" "h2" "h3"
+     "h4" "h5" "h6" "hr" "isindex" "menu" "noframes" "noscript" "ol" "p" "pre" "table"
+     "ul" "dd" "dt" "frameset" "li" "tbody" "td" "tfoot" "th" "thead" "tr" "script" )))
+
+;; Quoted =        '"' (!'"' .)* '"' | '\'' (!'\'' .)* '\''
+
+(define quoted
+  (let ((delimiters (satisfies (char-set #\" #\'))))
+    (sequence* ((delimiter delimiters)
+                (text (zero-or-more (preceded-by (none-of (is delimiter)) item)))
+                (_ (is delimiter)))
+      (result text))))
+
+;; HtmlAttribute = (AlphanumericAscii | '-')+ Spnl ('=' Spnl (Quoted | (!'>' Nonspacechar)+))? Spnl
+
+(define html-attribute
+  (sequence* ((name (-> (any-of alphanumeric-ascii (is #\-))
+                        (one-or-more)
+                        (as-string)))
+              (_ space-new-line)
+              (value (maybe (preceded-by (is #\=)
+                                         space-new-line
+                                         (->> non-space-char
+                                              (preceded-by (none-of (is #\>)))
+                                              (one-or-more)
+                                              (any-of quoted)
+                                              (as-string)))))
+              (_ space-new-line))
+    (let ((name (string->symbol name)))
+      (if value
+          (result (list name value))
+          (result name)))))
+
+;; HtmlBlockOpenAddress = '<' Spnl ("address" | "ADDRESS") Spnl HtmlAttribute* '>'
+;; HtmlBlockCloseAddress = '<' Spnl '/' ("address" | "ADDRESS") Spnl '>'
+;; HtmlBlockAddress = HtmlBlockOpenAddress (HtmlBlockAddress | !HtmlBlockCloseAddress .)* HtmlBlockCloseAddress
+
+(define (html-block-close el)
+  (preceded-by (is #\<) space-new-line (is #\/)
+               (char-seq el) space-new-line (is #\>)))
+
+(define (html-element el attrs #!optional (contents '()))
+  (append (list 'html-element (string->symbol el))
+          (if (null? attrs)
+              '()
+              (list (cons '@ attrs)))
+          contents))
+
+(define html-block-element
+  (mutually-recursive-parser
+   (sequence* ((_ (preceded-by (is #\<) space-new-line))
+               (el (any-of html-block-elements))
+               (close (result (html-block-close el)))
+               (_ space-new-line)
+               (attrs (zero-or-more html-attribute))
+               (_ (is #\>))
+               (contents (->> (preceded-by (none-of close) item)
+                              (one-or-more)
+                              (as-string)
+                              (any-of (html-block-element))
+                              (zero-or-more)))
+               (_ close))
+     (result (html-element el attrs contents)))))
+
+;; HtmlComment =   "<!--" (!"-->" .)* "-->"
+
+(define html-comment
+  (sequence* ((_ (char-seq "<!--"))
+              (text (->> item
+                         (preceded-by (none-of (char-seq "-->")))
+                         (zero-or-more)
+                         (as-string)))
+              (_ (char-seq "-->")))
+    (result `(comment ,text))))
+
+;; HtmlBlockSelfClosing = '<' Spnl HtmlBlockType Spnl HtmlAttribute* '/' Spnl '>'
+
+(define html-block-self-closing
+  (sequence* ((_ (preceded-by (is #\<) space-new-line))
+              (el (any-of html-block-elements))
+              (_ space-new-line)
+              (attrs (zero-or-more html-attribute))
+              (_ (preceded-by (maybe (is #\/)) space-new-line (is #\>))))
+    (result (html-element el attrs))))
+
+;; HtmlBlock = < ( HtmlBlockInTags | HtmlComment | HtmlBlockSelfClosing ) >
+;;             BlankLine+
+;;             {   if (extension(EXT_FILTER_HTML)) {
+;;                     $$ = mk_list(LIST, NULL);
+;;                 } else {
+;;                     $$ = mk_str(yytext);
+;;                     $$->key = HTMLBLOCK;
+;;                 }
+;;             }
+
+(define html-block
+  (any-of (html-block-element)
+          html-comment
+          html-block-self-closing))
+
 (define block
   (preceded-by
    (zero-or-more blank-line)
            heading
            ordered-list
            bullet-list
+           html-block
            paragraph
            plain)))
 
                    `(em . ,text)))
     (strong . ,(lambda (_ text)
                  `(strong . ,text)))
+    (html-element . ,(lambda (_ contents)
+                       contents))
+    (comment . ,(lambda (_ contents)
+                  (list #\< "!--" contents "--" #\>)))
     . ,alist-conv-rules*))
 
 (define (ref->alist-entry ref)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.