Anonymous avatar Anonymous committed 154a45f

Modified Files:
LibXML.xs
o name testing now uses libxml2-style UTF-8 character handling
(this version handles UTF-8 characters correctly)

dom.c dom.h
+ domParseChar()
parses one UTF-8 character from a string and returns its value and
its length in bytes (see the header file).

Comments (0)

Files changed (3)

 LibXML_test_node_name( xmlChar * name ) 
 {
     xmlChar * cur = name;
-    xmlChar * tc;
-    int t = 0;
+    int tc  = 0;
+    int len = 0; 
+
     if ( cur == NULL || *cur == 0 ) {
         return(0);
     }
 
-    tc = cur+UTF8SKIP(cur);
-    while (cur != tc) {
-        t = (t << 8) + *cur++;
-    }
-
-    if ( !( IS_LETTER( t ) || (t == '_') || (t == ':')) ) {
+    tc = domParseChar( cur, &len );
+
+    if ( !( IS_LETTER( tc ) || (tc == '_') || (tc == ':')) ) {
         return(0);
     }
-    t = 0;
-
-    while (*cur!=0 ) {
-        tc = cur+UTF8SKIP(cur);
-        while (cur != tc) {
-            t = (t << 8) + *cur++;
-        }
-
-        if (!(IS_LETTER(t) || IS_DIGIT(t) || (t == '_') || (t == '-') ||
-             (t == ':') || (t == '.') || IS_COMBINING(t) || IS_EXTENDER(t)) ) {
+
+    tc  =  0;
+    cur += len;
+
+    while (*cur != 0 ) {
+        tc = domParseChar( cur, &len );
+
+        if (!(IS_LETTER(tc) || IS_DIGIT(tc) || (tc == '_') ||
+             (tc == '-') || (tc == ':') || (tc == '.') ||
+             IS_COMBINING(tc) || IS_EXTENDER(tc)) ) {
             return(0);
         }
-        t = 0;
+        tc = 0;
+        cur += len;
     }
     
     return(1);
         XML::LibXML::DocumentFragment::appendText = 2
         XML::LibXML::DocumentFragment::appendTextNode = 3
     PREINIT:
-        xmlChar * content;
+        xmlChar * content = NULL;
     INIT:
         content = nodeSv2C( string, self );
         if ( content == NULL ) {
         xmlNodeAddContent( self, content );
         xmlFree(content);
 
+
 void
 appendTextChild( self, strname, strcontent=&PL_sv_undef, nsURI=&PL_sv_undef )
         xmlNodePtr self
 #endif
 
 /**
+ * NAME domParseChar
+ * TYPE function
+ * SYNOPSIS
+ *   int utf8char = domParseChar( curchar, &len );
+ *
+ * Returns the current character value; in UTF-8 a character may span
+ * multiple bytes of the given string. This function parses one UTF-8
+ * character from a string into its character value (an integer). It uses
+ * a slightly modified version of libxml2's character parser. libxml2
+ * itself does not provide any function to parse characters directly
+ * from a string and test whether they are valid UTF-8 characters.
+ *
+ * XML::LibXML uses this function rather than Perl's native UTF-8
+ * support for two reasons:
+ * 1) Perl's UTF-8 handling functions often lead to encoding errors,
+ *    partly because they are badly documented.
+ * 2) not all Perl versions XML::LibXML intends to run with have native
+ *    UTF-8 support.
+ *
+ * domParseChar() allows the very same code to be used with all versions
+ * of Perl :)
+ *
+ * Returns the current char value and its length
+ *
+ * NOTE: If the bytes passed to this function decode to a value that
+ * IS_CHAR rejects, the return value will be 0 and the length of the
+ * character is -1! For a NULL or empty string the return value is 0
+ * and the length is 0.
+ */
+int
+domParseChar( xmlChar *cur, int *len ) 
+{
+    unsigned char c;
+	unsigned int val;
+
+	/*
+	 * Decode the single UTF-8 character that starts at cur. The byte
+	 * length is written to *len and the decoded value is returned.
+	 *
+	 * We are supposed to handle UTF8, check it's valid
+	 * From rfc2044: encoding of the Unicode values on UTF-8:
+	 *
+	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+	 * 0000 0000-0000 007F   0xxxxxxx
+	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
+	 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	 *
+	 * Check for the 0x110000 limit too
+	 * (presumably covered by the IS_CHAR test below; confirm against
+	 * the libxml2 macro definition)
+	 *
+	 * NOTE(review): only the lead byte is inspected. The bytes
+	 * cur[1], cur[2] and cur[3] are read without testing for the
+	 * terminating NUL or for the 10xxxxxx continuation pattern, so a
+	 * truncated sequence is read past the end of the string.
+	 */
+    
+    if ( cur == NULL || *cur == 0 ) {
+        *len = 0; /* nothing to parse: zero length, zero value */
+        return(0);
+    }
+    
+    c = *cur; /* the lead byte alone selects the sequence length */
+    if ( c & 0x80 ) { /* high bit set: multi-byte sequence */
+        if ((c & 0xe0) == 0xe0) { /* top three bits set: 3 or 4 bytes */
+            if ((c & 0xf0) == 0xf0) { /* matches every lead byte 0xF0-0xFF */
+                /* 4-byte code */
+                *len = 4;
+                val = (cur[0] & 0x7) << 18;
+                val |= (cur[1] & 0x3f) << 12;
+                val |= (cur[2] & 0x3f) << 6;
+                val |= cur[3] & 0x3f;
+            } else {
+                /* 3-byte code */
+                *len = 3;
+                val = (cur[0] & 0xf) << 12;
+                val |= (cur[1] & 0x3f) << 6;
+                val |= cur[2] & 0x3f;
+            }
+	    } else { /* also taken for lead bytes 0x80-0xBF, which are not rejected here */
+            /* 2-byte code */
+            *len = 2;
+            val = (cur[0] & 0x1f) << 6;
+            val |= cur[1] & 0x3f;
+	    }
+        if ( !IS_CHAR(val) ) { /* decoded value rejected by IS_CHAR */
+            *len = -1; /* -1 signals failure; callers must not advance by it */
+            return(0);
+        }
+	    return(val); /* unsigned val is converted to the int return type */
+    }
+    else {
+        /* 1-byte code */
+	    *len = 1;
+        return((int)c); 
+    }
+}
+
+/**
  * Name: domReadWellBalancedString
  * Synopsis: xmlNodePtr domReadWellBalancedString( xmlDocPtr doc, xmlChar *string )
  * @doc: the document, the string should belong to
     }
     return(1);
 }
+
 * unsorted. 
  **/
 
+
+/**
+ * NAME domParseChar
+ * TYPE function
+ * SYNOPSIS
+ *   int utf8char = domParseChar( curchar, &len );
+ *
+ * Returns the current character value; in UTF-8 a character may span
+ * multiple bytes of the given string. This function parses one UTF-8
+ * character from a string into its character value (an integer). It uses
+ * a slightly modified version of libxml2's character parser. libxml2
+ * itself does not provide any function to parse characters directly
+ * from a string and test whether they are valid UTF-8 characters.
+ *
+ * XML::LibXML uses this function rather than Perl's native UTF-8
+ * support for two reasons:
+ * 1) Perl's UTF-8 handling functions often lead to encoding errors,
+ *    partly because they are badly documented.
+ * 2) not all Perl versions XML::LibXML intends to run with have native
+ *    UTF-8 support.
+ *
+ * domParseChar() allows the very same code to be used with all versions
+ * of Perl :)
+ *
+ * Returns the current char value and its length
+ *
+ * NOTE: If the bytes passed to this function decode to a value that
+ * IS_CHAR rejects, the return value will be 0 and the length of the
+ * character is -1! For a NULL or empty string the return value is 0
+ * and the length is 0.
+ */
+/* Prototype for domParseChar(): cur points at the first byte of the
+ * character to parse, and the parsed length is stored through len.
+ * The first parameter is named cur, matching the definition, because
+ * char is a reserved word and cannot be used as a parameter name. */
+int
+domParseChar( xmlChar *cur, int *len );
+
 xmlNodePtr 
 domReadWellBalancedString( xmlDocPtr doc, xmlChar* string, int repair );
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.