Anonymous avatar Anonymous committed a8a2237

Modified Files:
dom.c dom.h LibXML.xs LibXML.pm
+ added libxml dom recovering for the XML parsers.
prepared dom.c to recover chunks, too.

Changes
version notes

Comments (0)

Files changed (5)

 manpage!
 
 1.53
+   - implemented libxml2 dom recovering
    - introduced transparent XML::GDOME import. (EXPERIMENTAL)
    - calling external entity handlers work again
    - fixed a bug while replacing the document element.
     return $self->{XML_LIBXML_VALIDATION};
 }
 
+sub recover {    # Combined getter/setter for the XML_LIBXML_RECOVER flag stored on the parser object.
+    my $self = shift;
+    $self->{XML_LIBXML_RECOVER} = shift if scalar @_;    # Only overwrite the flag when an argument was actually passed.
+    return $self->{XML_LIBXML_RECOVER};    # Always hand back the current value (undef if it was never set).
+}
+
 sub expand_entities {
     my $self = shift;
     $self->{XML_LIBXML_EXPAND_ENTITIES} = shift if scalar @_;
 
 Turn validation on (or off). Defaults to off.
 
+=head2 recover
+
+  $parser->recover(1);
+
+Turn the parser's recover mode on (or off). Defaults to off.
+
+This allows broken XML data to be parsed into memory.  This switch will
+only work with XML data rather than HTML data. Also, validation
+will be switched off automatically.
+
+The recover mode helps to recover documents that are almost well-formed
+very efficiently. That is, for example, a document that forgets to close
+the document tag (or any other tag inside the document). The recover
+mode of XML::LibXML has problems, though, restoring documents that are
+more like well-balanced chunks. In that case XML::LibXML will only
+parse the first tag of the chunk.
+
 =head2 expand_entities
 
   $parser->expand_entities(0);
 
 =item validation == off (0)
 
+=item recover == off (0)
+
 =item expand_entities == on (1)
 
 =item keep_blanks == on (1)
         xmlDocPtr real_dom;
         HV* real_obj = (HV *)SvRV(self);
         SV** item    = NULL;
+        int recover ;
     CODE:
         ptr = SvPV(string, len);
         if (len == 0) {
             real_dom->URL = xmlStrdup((const xmlChar*)directory);
         }
 
-        if (!well_formed
-            || (xmlDoValidityCheckingDefaultValue
-                && !valid 
-                && (real_dom->intSubset || real_dom->extSubset) ) ) {
+        item = hv_fetch( real_obj, "XML_LIBXML_RECOVER", 18, 0 );
+        recover = ( item != NULL && SvTRUE(*item) ) ? 1 : 0;
+        if ( ( !well_formed && !recover )
+               || (xmlDoValidityCheckingDefaultValue
+                    && !valid && !recover 
+                    && (real_dom->intSubset || real_dom->extSubset) ) ) {
             xmlFreeDoc(real_dom);
             RETVAL = &PL_sv_undef;    
             croak(SvPV(LibXML_error, len));
         xmlDocPtr real_dom;
         HV* real_obj = (HV *)SvRV(self);
         SV** item    = NULL;
+        int recover = 0;
     CODE:
         LibXML_error = NEWSV(0, 512);
         sv_setpvn(LibXML_error, "", 0);
         real_dom = LibXML_parse_stream(self, fh, directory);
         
         sv_2mortal(LibXML_error);
-        
+
+        item = hv_fetch( real_obj, "XML_LIBXML_RECOVER", 18, 0 );
+        recover = ( item != NULL && SvTRUE( *item ) ) ? 1 : 0;
+
         if (real_dom == NULL) {
             if ( SvCUR( LibXML_error ) > 0 ) {
                 croak(SvPV(LibXML_error, len));
         }
         else if (xmlDoValidityCheckingDefaultValue
                  && SvCUR(LibXML_error) > 0
-                 && (real_dom->intSubset || real_dom->extSubset)  ) {
+                 && (real_dom->intSubset || real_dom->extSubset) 
+                 && recover == 0 ) {
             croak(SvPV(LibXML_error, len));
         }
         else {
         xmlDocPtr real_dom = NULL;
         HV* real_obj = (HV *)SvRV(self);
         SV** item    = NULL;
+        int recover;
     CODE:
         LibXML_init_parser(self);
         ctxt = xmlCreateFileParserCtxt(filename);
             }
             XSRETURN_UNDEF;
         }
-        
-        if (!well_formed
-            || (xmlDoValidityCheckingDefaultValue
-                && (!valid
-                    || SvCUR(LibXML_error) > 0 )
-                && (real_dom->intSubset
-                    || real_dom->extSubset) )) {
+
+        item = hv_fetch( real_obj, "XML_LIBXML_RECOVER", 18, 0 );
+        recover = ( item != NULL && SvTRUE(*item) ) ? 1 : 0;
+
+        if (  ( !well_formed && !recover )
+               || (xmlDoValidityCheckingDefaultValue
+                   && !recover 
+                   && (!valid
+                       || SvCUR(LibXML_error) > 0 )
+                   && (real_dom->intSubset
+                       || real_dom->extSubset) )  ) {
             xmlFreeDoc(real_dom);
             croak("'%s'",SvPV(LibXML_error, len));
             XSRETURN_UNDEF;
         STRLEN len;
         HV* real_obj = (HV *)SvRV(self);
         SV** item    = NULL;
+        int recover;
     CODE:
         if ( encoding == NULL ) encoding = "UTF-8";
         ptr = SvPV(svchunk, len);
         chunk = Sv2C(svchunk, (const xmlChar*)encoding);
 
         if ( chunk != NULL ) {
+            item = hv_fetch( real_obj, "XML_LIBXML_RECOVER", 18, 0 );
+            recover = ( item != NULL && SvTRUE(*item) ) ? 1 : 0;
+           
             LibXML_error = sv_2mortal(newSVpv("", 0));
 
             LibXML_init_parser(self);
-            rv = domReadWellBalancedString( NULL, chunk );
+            rv = domReadWellBalancedString( NULL, chunk, recover );
             LibXML_cleanup_callbacks();
             LibXML_cleanup_parser();    
 
             if ( rv != NULL ) {
                 /* now we append the nodelist to a document
                    fragment which is unbound to a Document!!!! */
-                # warn( "good chunk, create fragment" );
                 item = hv_fetch( real_obj, "XML_LIBXML_GDOME", 16, 0 );
 
                 /* step 1: create the fragment */
                 }
             }
             else {
-                # warn( "bad chunk" );
+                warn( "bad chunk" );
                 croak(SvPV(LibXML_error, len));
                 XSRETURN_UNDEF;
             }
  *
  * in 99% the cases i believe it is faster than to create the dom by hand,
  * and skip the parsing job which has to be done here.
+ *
+ * the repair flag will not be recognized with the current libxml2
  **/
 xmlNodePtr 
-domReadWellBalancedString( xmlDocPtr doc, xmlChar* block ) {
+domReadWellBalancedString( xmlDocPtr doc, xmlChar* block, int repair ) {
     int retCode       = -1;
     xmlNodePtr nodes  = NULL;
     
                                                block,
                                                &nodes );
 
+/*         retCode = xmlParseBalancedChunkMemoryRecover( doc,  */
+/*                                                       NULL, */
+/*                                                       NULL, */
+/*                                                       0, */
+/*                                                       block, */
+/*                                                       &nodes, */
+/*                                                       repair ); */
+
         /* error handling */
-        if ( retCode != 0 ) {
+        if ( retCode != 0 && repair == 0 ) {
             /* if the code was not well balanced, we will not return 
              * a bad node list, but we have to free the nodes */
             xmlFreeNodeList( nodes );
+            nodes = NULL;
         }
         else {
             xmlSetListDoc(nodes,doc);
  **/
 
 xmlNodePtr 
-domReadWellBalancedString( xmlDocPtr doc, xmlChar* string );
+domReadWellBalancedString( xmlDocPtr doc, xmlChar* string, int repair );
 
 /**
  * NAME domIsParent
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.