Commits

Shlomi Fish committed 92a16a8

Feature: joining contiguous character data together in the SAX driver.

Apply the patch from https://rt.cpan.org/Ticket/Display.html?id=52368 .

Comments (0)

Files changed (4)

         - Adds colours and stuff like that.
     - Add << LICENSE => 'perl' >> to the Makefile.PL for a license
     meta-data in the META.YML.
+    - Feature implementation: joining contiguous character data together in
+    the SAX driver.
+        - Apply a somewhat modified patch from:
+            - https://rt.cpan.org/Ticket/Display.html?id=52368
 
 1.74            Thu Jun 23 16:20:42 IDT 2011
     - More work on the t/*.t test scripts.
             <para>In addition to the generic functions, which are only able to process entire documents, XML::LibXML::SAX provides <emphasis>parse_chunk()</emphasis>.
             This method generates SAX events from well-balanced data such as is often provided by databases.</para>
         </sect1>
+
+        <sect1>
+            <title>Features</title>
+
+            <para><emphasis>NOTE:</emphasis> This feature is experimental. </para>
+
+            <para>You can enable character data joining, which may yield a
+            significant speed boost when the document contains little markup
+            relative to its text, by enabling the
+            http://xmlns.perl.org/sax/join-character-data feature of this
+            parser. This is done via the set_feature method, like
+            this:
+            </para>
+            
+            <programlisting>$p->set_feature('http://xmlns.perl.org/sax/join-character-data', 1); 
+            </programlisting>
+
+            <para>
+            Pass 0 instead of 1 to disable the feature again. The feature
+            is disabled by default.
+            </para>
+        </sect1>
     </chapter>
 
     <chapter id="XML-LibXML-SAX-Builder">

lib/XML/LibXML/SAX.pm

   return $XML::LibXML::__threads_shared ? 0 : 1;
 }
 
+# set_feature($feature, $value)
+#
+# Intercepts the character-data joining feature and records the requested
+# value on the parser object under JOIN_CHARACTERS; every other feature is
+# handed on to the parent class unchanged.
+# Returns 1 for the joining feature, otherwise whatever the parent returns.
+sub set_feature {
+	my $self = shift;
+	my ($feature, $value) = @_;
+
+	# Anything that is not our own feature belongs to the superclass.
+	unless ($feature eq 'http://xmlns.perl.org/sax/join-character-data') {
+		return $self->SUPER::set_feature(@_);
+	}
+
+	$self->{JOIN_CHARACTERS} = $value;
+	return 1;
+}
+
 sub _parse_characterstream {
     my ( $self, $fh ) = @_;
     # this may catch the xml decl, so the parser won't get confused about
 
 sub _parse_bytestream {
     my ( $self, $fh ) = @_;
+ 
     $self->{ParserOptions}{LibParser}      = XML::LibXML->new;
     $self->{ParserOptions}{ParseFunc}      = \&XML::LibXML::parse_fh;
     $self->{ParserOptions}{ParseFuncParam} = $fh;
 sub _parse {
     my $self = shift;
     my $args = bless $self->{ParserOptions}, ref($self);
+    
+    if (defined($self->{JOIN_CHARACTERS})) {
+    	$args->{LibParser}->{JOIN_CHARACTERS} = $self->{JOIN_CHARACTERS};
+    } else {
+    	$args->{LibParser}->{JOIN_CHARACTERS} = 0;
+    }
 
     $args->{LibParser}->set_handler( $self );
     eval {
     return;
 }
 
-
 1;
 

perl-libxml-sax.c

     xmlDocPtr ns_stack_root;
     SV * handler;
     SV * saved_error;
+    struct CBuffer *charbuf;
+    int joinchars;
 } PmmSAXVector;
 
 typedef PmmSAXVector* PmmSAXVectorPtr;
 
+struct CBufferChunk { /* one link in the singly linked list of buffered character data */
+	struct CBufferChunk *next; /* following chunk, or NULL on the last chunk */
+	xmlChar *data; /* xmlMalloc'd copy of the appended bytes; NULL on a chunk that holds nothing yet */
+	int len; /* number of bytes stored in data */
+};
+
+struct CBuffer { /* list of character-data chunks collected between flushes */
+	struct CBufferChunk *head; /* first chunk; head->data == NULL is used as the "buffer is empty" test */
+	struct CBufferChunk *tail; /* last chunk; kept empty so the next append can fill it */
+};
+
 static U32 PrefixHash; /* pre-computed */
 static U32 NsURIHash;
 static U32 NameHash;
     return retval;
 }
 
-
 void
 PmmSAXInitialize(pTHX)
 {
 }
 
 xmlSAXHandlerPtr PSaxGetHandler();
+int PSaxCharactersFlush(void *, struct CBuffer *);
+
+
+/* Character buffering functions */
+
+/*
+ * CBufferChunkNew - allocate one zero-initialised buffer chunk.
+ *
+ * Returns the new chunk (next == NULL, data == NULL, len == 0), or NULL
+ * when xmlMalloc() fails.  The allocation result is checked before it is
+ * cleared so that an out-of-memory condition no longer writes through a
+ * NULL pointer.
+ */
+struct CBufferChunk * CBufferChunkNew(void) {
+	struct CBufferChunk *chunk = xmlMalloc(sizeof(*chunk));
+
+	if (chunk == NULL) {
+		return NULL;
+	}
+
+	memset(chunk, 0, sizeof(*chunk));
+	return chunk;
+}
+
+/*
+ * CBufferNew - create an empty character buffer.
+ *
+ * The buffer starts out with a single zeroed chunk that serves as both
+ * head and tail; a head chunk whose data pointer is NULL is how the other
+ * buffer functions recognise an empty buffer.
+ *
+ * Returns the new buffer, or NULL if either allocation fails (the partial
+ * allocation is released first).  Release a buffer with CBufferFree().
+ */
+struct CBuffer * CBufferNew(void) {
+	struct CBuffer *buffer = xmlMalloc(sizeof(*buffer));
+	struct CBufferChunk *first;
+
+	if (buffer == NULL) {
+		return NULL;
+	}
+
+	first = CBufferChunkNew();
+	if (first == NULL) {
+		/* Do not leak the container when the first chunk is unavailable. */
+		xmlFree(buffer);
+		return NULL;
+	}
+
+	memset(buffer, 0, sizeof(*buffer));
+	buffer->head = first;
+	buffer->tail = first;
+
+	return buffer;
+}
+
+/*
+ * CBufferPurge - throw away everything stored in the buffer and leave it
+ * empty again, ready for further appends.
+ *
+ * A NULL buffer, or a buffer whose first chunk carries no data, is left
+ * untouched.  Otherwise every chunk (and its data) is released and a
+ * fresh empty chunk becomes both head and tail.
+ */
+void CBufferPurge(struct CBuffer *buffer) {
+	struct CBufferChunk *chunk;
+	struct CBufferChunk *following;
+
+	if (buffer == NULL || buffer->head->data == NULL) {
+		return;
+	}
+
+	for (chunk = buffer->head; chunk != NULL; chunk = following) {
+		/* Remember the successor before this chunk is released. */
+		following = chunk->next;
+
+		if (chunk->data) {
+			xmlFree(chunk->data);
+		}
+
+		xmlFree(chunk);
+	}
+
+	buffer->head = CBufferChunkNew();
+	buffer->tail = buffer->head;
+}
+
+/*
+ * CBufferFree - release a buffer together with all of its chunks and the
+ * data they hold.  Passing NULL is allowed and does nothing.
+ */
+void CBufferFree(struct CBuffer *buffer) {
+	struct CBufferChunk *chunk;
+	struct CBufferChunk *following;
+
+	if (buffer == NULL) {
+		return;
+	}
+
+	chunk = buffer->head;
+	while (chunk != NULL) {
+		/* Grab the link first: the chunk is gone after xmlFree(). */
+		following = chunk->next;
+
+		if (chunk->data) {
+			xmlFree(chunk->data);
+		}
+		xmlFree(chunk);
+
+		chunk = following;
+	}
+
+	xmlFree(buffer);
+}
+
+/*
+ * CBufferLength - total number of bytes held by all chunks of the buffer.
+ *
+ * Walks the chunk list from head to the end and sums each chunk's len;
+ * chunks that hold nothing contribute 0.
+ */
+int CBufferLength(struct CBuffer *buffer) {
+	struct CBufferChunk *chunk = buffer->head;
+	int total = 0;
+
+	while (chunk != NULL) {
+		total += chunk->len;
+		chunk = chunk->next;
+	}
+
+	return total;
+}
+
+/*
+ * CBufferAppend - store a private copy of len bytes of newstring at the
+ * end of the buffer.
+ *
+ * The copy is placed in the current (empty) tail chunk and a new empty
+ * chunk is linked in as the next tail.
+ *
+ * The copy is always allocated with one spare byte.  A zero-length
+ * request to the allocator may legally yield NULL, and a NULL data
+ * pointer on the head chunk is what the other buffer functions treat as
+ * "buffer is empty", so an empty append must still produce a non-NULL
+ * data pointer.
+ *
+ * Invalid arguments (NULL buffer or string, negative length) are ignored.
+ * If memory cannot be obtained the buffer is left exactly as it was; the
+ * void interface offers no way to report this, so the data of that call
+ * is dropped.
+ */
+void CBufferAppend(struct CBuffer *buffer, const xmlChar *newstring, int len) {
+	xmlChar *copy;
+	struct CBufferChunk *next;
+
+	if (buffer == NULL || newstring == NULL || len < 0) {
+		return;
+	}
+
+	copy = xmlMalloc((size_t)len + 1);
+	if (copy == NULL) {
+		return;
+	}
+
+	/* Obtain the next tail before touching the list so failure is harmless. */
+	next = CBufferChunkNew();
+	if (next == NULL) {
+		xmlFree(copy);
+		return;
+	}
+
+	memcpy(copy, newstring, (size_t)len);
+	copy[len] = '\0';
+
+	buffer->tail->data = copy;
+	buffer->tail->len = len;
+	buffer->tail->next = next;
+	buffer->tail = next;
+}
+
+/*
+ * CBufferCharacters - join all buffered chunks into one NUL-terminated
+ * string.
+ *
+ * Returns a newly xmlMalloc'd string of CBufferLength(buffer) bytes plus
+ * a terminating NUL; the caller owns it and must xmlFree() it.  Returns
+ * NULL when the buffer is empty (head chunk has no data) or when the
+ * allocation fails.
+ *
+ * The emptiness test is made before allocating, so the empty case no
+ * longer leaks the result buffer.
+ */
+xmlChar * CBufferCharacters(struct CBuffer *buffer) {
+	int length;
+	int copied = 0;
+	xmlChar *joined;
+	xmlChar *dst;
+	struct CBufferChunk *cur;
+
+	if (buffer->head->data == NULL) {
+		return NULL;
+	}
+
+	length = CBufferLength(buffer);
+
+	joined = xmlMalloc((size_t)length + 1);
+	if (joined == NULL) {
+		return NULL;
+	}
+
+	dst = joined;
+	for (cur = buffer->head; cur != NULL; cur = cur->next) {
+		if (! cur->data) {
+			continue;
+		}
+
+		/* Sanity check: never write past the size computed above. */
+		copied += cur->len;
+		if (copied > length) {
+			fprintf(stderr, "string overflow\n");
+			abort();
+		}
+
+		memcpy(dst, cur->data, (size_t)cur->len);
+		dst += cur->len;
+	}
+
+	joined[length] = '\0';
+
+	return joined;
+}
+
+/* end character buffering functions */
 
 
 void
 {
     PmmSAXVectorPtr vec = NULL;
     SV ** th;
+    SV ** joinchars;
+
     dTHX;
 
     CLEAR_SERROR_HANDLER
         vec->handler = SvREFCNT_inc(*th)  ;
     }
     else {
-        vec->handler = NULL  ;
+        vec->handler = NULL;
+    }
+
+    joinchars = hv_fetch((HV*)SvRV(parser), "JOIN_CHARACTERS", 15, 0);
+
+    if (joinchars != NULL) {
+    	vec->joinchars = (SvIV(*joinchars));
+    } else {
+    	vec->joinchars = 0;
+    }
+
+    if (vec->joinchars) {
+        vec->charbuf = CBufferNew();
+    } else {
+    	vec->charbuf = NULL;
     }
 
     if ( ctxt->sax ) {
         vec->handler = NULL;
     }
 
+    CBufferFree(vec->charbuf);
+    vec->charbuf = NULL;
+
     xmlFree( ctxt->sax );
     ctxt->sax = NULL;
 
     xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
     PmmSAXVectorPtr  sax  = (PmmSAXVectorPtr)ctxt->_private;
 
+    if (sax->joinchars)
+    {
+        PSaxCharactersFlush(ctxt, sax->charbuf);
+    }
+
     dTHX;
     dSP;
 
     SV * rv;
     SV * arv;
 
+    if (sax->joinchars)
+    {
+        PSaxCharactersFlush(ctxt, sax->charbuf);
+    }
+
     dSP;
     
     ENTER;
     SV * rv;
     HV * element;
 
+    if (sax->joinchars)
+    {
+        PSaxCharactersFlush(ctxt, sax->charbuf);
+    }
+
     dSP;
 
     ENTER;
 }
 
 int
-PSaxCharacters(void *ctx, const xmlChar * ch, int len) {
+PSaxCharactersDispatch(void *ctx, const xmlChar * ch, int len) {
     xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
     PmmSAXVectorPtr sax = (PmmSAXVectorPtr)ctxt->_private;
     dTHX;
 
         if (SvTRUE(ERRSV)) {
             croak_obj;
-	}
-        
+        }
         FREETMPS ;
         LEAVE ;
 
     return 1;
 }
 
+/*
+ * PSaxCharactersFlush - deliver everything collected in buffer as a single
+ * characters event and empty the buffer.
+ *
+ * ctx    - the parser context, passed through to PSaxCharactersDispatch().
+ * buffer - the character buffer to flush (callers pass the vector's
+ *          charbuf).
+ *
+ * Returns 1 when there is nothing to flush, otherwise the result of
+ * PSaxCharactersDispatch().
+ *
+ * The joined string returned by CBufferCharacters() is owned by this
+ * function and is released once the dispatch has returned; previously it
+ * was leaked on every flush.  NOTE(review): the dispatcher contains a
+ * croak path; if it leaves through a Perl exception the copy is still
+ * not released.
+ */
+int PSaxCharactersFlush (void *ctx, struct CBuffer *buffer) {
+    xmlChar *ch;
+    int len;
+    int rc;
+
+    if (buffer == NULL || buffer->head == NULL || buffer->head->data == NULL) {
+        return 1;
+    }
+
+    ch = CBufferCharacters(buffer);
+    len = CBufferLength(buffer);
+
+    /* Empty the buffer before dispatching so it is clean even if the
+     * handler does not return normally. */
+    CBufferPurge(buffer);
+
+    if (ch == NULL) {
+        /* Joining failed (no memory): nothing can be delivered. */
+        return 1;
+    }
+
+    rc = PSaxCharactersDispatch(ctx, ch, len);
+    xmlFree(ch);
+
+    return rc;
+}
+
+/*
+ * PSaxCharacters - characters callback.
+ *
+ * With character joining switched off the data goes straight to
+ * PSaxCharactersDispatch().  With joining switched on the data is only
+ * appended to the vector's character buffer and 1 is returned; it is
+ * delivered later by PSaxCharactersFlush().
+ */
+int PSaxCharacters (void *ctx, const xmlChar * ch, int len) {
+    xmlParserCtxtPtr parser_ctxt = (xmlParserCtxtPtr)ctx;
+    PmmSAXVectorPtr vec = (PmmSAXVectorPtr)parser_ctxt->_private;
+
+    if (!vec->joinchars) {
+        return PSaxCharactersDispatch(ctx, ch, len);
+    }
+
+    CBufferAppend(vec->charbuf, ch, len);
+    return 1;
+}
+
 int
 PSaxComment(void *ctx, const xmlChar * ch) {
     xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
     if ( ch != NULL && handler != NULL ) {
         int len = xmlStrlen( ch );
 
+        if (sax->joinchars)
+        {
+            PSaxCharactersFlush(ctxt, sax->charbuf);
+        }
+
         dSP;
 
         ENTER;
 
     if ( ch != NULL && handler != NULL ) {
 
+        if (sax->joinchars)
+        {
+            PSaxCharactersFlush(ctxt, sax->charbuf);
+        }
+
         dSP;
 
         ENTER;
     SV * rv = NULL;
 
     if ( handler != NULL ) {
+        if (sax->joinchars)
+        {
+            PSaxCharactersFlush(ctxt, sax->charbuf);
+        }
+
         dSP;
     
         ENTER;