Commits

Marcus Pope  committed 8988456

Fixed a bug in Array.prototype.random. Added a bunch of HTML parsing functions and bug fixes. Added URI-object support to HTTP GET for Tor access. Added some test code for win32ole wrappers in Node.js... #crazy

  • Participants
  • Parent commits 21643c0

Comments (0)

Files changed (6)

File lib/array.prototype.js

 
     Array.prototype.random = function(cnt) {
         //returns N random values from an array
-        if (cnt >= this.length) return this;
+        if (cnt >= this.length || !this.length) return this;
 
-        return (cnt || 1).iterate(function(i) {
-            return this[rand(0, this.length)];
-        }).trueType();
+        var a = this;
+        return (cnt || 1).iterate(function(i){
+            return a[Math.random(0, a.length - 1)];
+        });
     };
 
     Array.prototype.surrounding =

File lib/function.prototype.js

 
             self.precedence = function(a, b) {
                 //compare indices in order of precedence
-                return self.order[a.getPunctuation() || a.length == 0 || "none"] - self.order[b.getPunctuation() || b.length == 0 || "none"];
+                return self.order[a.punctuation() || a.length == 0 || "none"] - self.order[b.punctuation() || b.length == 0 || "none"];
             };
         }
 
 //@todo: add logic for file uploads
 
 new (function() {
-    
+
     var self = this;
 
     //bifurcate http/https node libs :(
                    'https:' : require('https') },
         url = require('url'),
         qs = require('querystring');
-        
+
     var get = function(uri, cb, binary) {
-        
-        var parts = url.parse(uri);
-        
+
+        if (typeof uri == "string") {
+            var parts = url.parse(uri);
+        }
+        else {
+            //uri can be an object, which is used for things like proxied requests
+            parts = uri;
+        }
+
         //http/https
-        return scheme[parts.protocol].get(url.parse(uri), function(r) {
+        return scheme[parts.protocol || "http:"].get(parts, function(r) {
             var data = '';
-            
+
             if (binary) {
                 //@todo: can we autodetect this flag?
                 r.setEncoding('binary');
             }
-            
+
             r.on('error', cb);
-          
+
             r.on('data', function(hunk) {
                 //@todo: stream to filesystem on server side, so we don't consume massive amounts of memory
                 //for large downloads
                 data += hunk;
             });
-          
+
             r.on('end', function() {
                 cb(data);
             });
         });
-                       
-        
+
     };
-    
+
     self.get = function(uri, cb) {
-        //http get request - async only for now 
+        //http get request - async only for now
         //uri = uri.toUri(); @todo: create String.proto.toUri to support things like www.google.com
-        return get(uri, cb); 
+        return get(uri, cb);
     };
-    
+
     self.binary = function(uri, filename, cb) {
         //Download a binary file from the internet (image, zip, mp3 etc)
         //optionally specify a target filename.
                 fs.load();
                 filename = fs.temp(null, parts.pathname.extension());
             }
-        
+
             file.binary(filename, data, function(err){
                 //return filepath or error
                 if (err) return cb(err);
             });
         }, true); //set binary flag
     };
-    
+
     self.post = function(uri, data, cb) {
         //http post request
-        
+
         //encode data into querystring
         if (!data.isa('string')) {
             //@todo: verify key : array value pairs are converted properly
             data = qs.stringify(data);
         }
-        
+
         //parse url and setup post headers
         var parts = url.parse(uri);
             parts.method = 'POST';
                 'Content-Type' : 'application/x-www-form-urlencoded',
                 'Content-Length' : data.length
             };
-        
+
         //create request object
         var req = scheme[parts.protocol].request(parts, function(r) {
             //handle response
             var data = "";
-            
+
             r.addListener('data', function(hunk) {
-                data += hunk;   
+                data += hunk;
             });
-            
+
             r.addListener('end', function() {
                 //send data to callback
                 cb(data);
-            });                
-            
+            });
+
         });
-        
+
         req.on('error', function(e) {
             //or send the error to callback
             cb(e);
         });
-        
+
         //push the data and end the request
         req.write(data);
         req.end();
     };
-    
+
     self.upload = function() {
-        Error.notImplemented().toss();   
+        Error.notImplemented().toss();
     };
-    
+
     if (typeof module != "undefined") {
-        module.exports = self; 
+        module.exports = self;
     }
-    
+
     return self;
 })

File lib/string.prototype.js

             replace(/\&nbsp;{4}/g, "\t").
             replace(/\&nbsp;/g, " ").
             replace(/\&apos;/g, "'").
+            replace(/\&#39;/g, "'").
             replace(/<br\/>/g, "\n");
     };
 
     };
 
     String.prototype.paragraphs = function() {
-        return this.split(/((\r\n){2,}|(\s){4,})/g);
+        return this.split(/((\r|\n){2,}|(\s){4,})/g);
     };
 
     String.prototype.words = function() {
 
     })(String.prototype.replace);
 
+    (function(omatch) {
+        String.prototype.match = function(regexp, cb) {
+
+            var b = omatch.apply(this, [regexp]);
+
+            if (cb && b) {
+                cb.apply(this, b);
+            }
+
+            return b;
+        };
+    })(String.prototype.match);
+
     String.prototype.unescape = function() {
         return unescape(this);
     };
         return this.toNumber().toRad();
     };
 
+    // @todo: let's group html specific string proto extensions into a sub module we autoload if needed
+    String.prototype.parseAnchors = function(cb) {
+        return this.replace(/<a (.*?)>(.*?)<\/a>/gi, function(all, attr, text, pos) {
+            //replace anchors with textual link references
+            var url = ((attr.match(/href=['"](.*?)['"]/i) || "")[1] || "").toLower();
+
+            return cb(url, text, all);
+        });
+    };
+
+    String.prototype.cleanAnchors = function(origin) {
+        //removes anchors that are not links to resources
+        return this.parseAnchors(function(url, text, all) {
+            //no href? no soup...
+            if (!url || url == "#" || url.startsWith('javascript:') || url.startsWith('mailto:')) return "";
+
+            return all;
+        });
+    };
+
+    String.prototype.stripAnchors = function(origin) {
+        //removes anchor tags, replaces them with textual representations (markdown) of the link
+        return this.parseAnchors(function(url, text) {
+            //normalize relative/root urls to absolute paths
+            if (origin) {
+                var host, path;
+
+                origin.split('/').use(function() {
+                    host = this.before(4).join('/');
+                    path = this.slice(0, -1).join('/');
+                });
+
+                if (url.startsWith('/')) {
+                    url = host + url;
+                }
+                if (url.startsWith('.')) {
+                    url = path + url;
+                }
+            }
+
+            return "[" + text + "](" + url + ")";
+        });
+    };
+
+    String.prototype.cleanImages = function() {
+        return this.replace(/<img (.*?)\/?>/gi, function(all, attr, pos) {
+            var alt = (attr.match(/alt=['"](.*?)['"]/i) || "")[1];
+
+            if (alt) {
+                return "[image: " + alt + "]";
+            }
+            else {
+                return "[image]";
+            }
+        });
+    };
+
     String.prototype.prepHTML =
     String.prototype.prepHtml = function() {
         //Prep html inserts non breaking spaces in key parts of html so that calling innerText on the resulting
         //dom node will not merge two string parts together
-        return this.replace(/<br\s?\/?>/gi, '\n').replace(/<\/p>/i, "\n\n").replace(/<\/li>/i, "\n\n").replace(/<\/(.*?)>/g, function(v) {
+        return this.replace(/<br\s?\/?>/gi, '\n').replace(/<\/(p|li|title)>/i, "\n\n").replace(/<\/(.*?)>/g, function(v) {
             return "&nbsp;" + v;
         });
     };
 
     String.prototype.stripXml =
-    String.prototype.stripHtml = function(ie_opt, tag_opt) {
-        //@todo: implement 3rd party dom parser
-        tag_opt = def(tag_opt, "div");
+    String.prototype.stripHtml = function(url, ie_opt, tag_opt) {
+        var res = "";
 
-        var obj = ie_opt.document.createElement(tag_opt);
+        if (!ie_opt) {
+            res = this.removeDoctype().removeScripts().removeCSS().cleanImages().cleanAnchors().stripAnchors(url).prepHtml().stripTags();
+        }
+        else {
+            tag_opt = def(tag_opt, "div");
+
+            var obj = ie_opt.document.createElement(tag_opt);
             obj.innerHTML = this.prepHtml();
-        var txt = obj.innerText
+            res = obj.innerText
             obj = null;
+        }
 
-        return txt;
+        return res;
+        // @todo: consider something along these lines...
+        return res.lines().ea(function(v) { return v.trim(); }).join("\n");
     };
 
     String.prototype.isHtml = function() {
         return (this.stripTags() != this);
     };
 
+    String.prototype.removeDoctype = function() {
+        return this.replace(/<!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>/ig, "");
+    };
+
+    String.prototype.removeMetaRedirect = function() {
+        return this.replace(/<meta [^>]*?http-equiv=("|'|)refresh("|'|) /gi, '<meta ');
+    };
+
     String.prototype.stripTags = function(allowed) {
         // Strips HTML and PHP tags from a string
 
 
     String.prototype.removeScripts = function() {
         //removes any script tags from this string
-        var block = false;
-        return this.lines().ea(function(txt) {
-            if (!block) {
-                if ( txt.contains("<" + "script") &&
-                    !txt.contains("src=")) {
-                    block = true;
-                }
-            } else {
-                if (txt.contains("</" + "script>")) {
-                    block = false;
-                } else {
-                    return txt;
-                }
-            }
-        }).join("\r\n");
+        return this.
+            replace(/<script[^>]*>([\s\S]+?)<\/script>/gi, function(r) {
+                return " ";
+            });
     };
 
     String.prototype.cssTags = function() {
 
     String.prototype.removeCSS = function() {
         //removes any style tags from this string
-        var block = false;
-        return this.lines().ea(function(txt) {
-            if (!block) {
-                if ( txt.contains("<" + "style") &&
-                    !txt.contains("href=")) {
-                    block = true;
-                }
-            } else {
-                if (txt.contains("</" + "style>")) {
-                    block = false;
-                } else {
-                    return txt;
-                }
-            }
-        }).join("\r\n");
+        return this.
+            replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, " ").
+            replace(/<link[\s\S]+?\/>/gi, function(r) {
+                return " ";
+            });
     };
 
     String.prototype.between = function(left, right) {
         return this.without(/[^a-z0-9 ]/i);
     };
 
-    String.prototype.getPunctuation = function() {
+    String.prototype.punctuation = function() {
         //return all punctuation from this string
         return this.match(/[\*\+\=\.\,\;\:\?\!\-\(\)\"\'\&\^\%\$\#\@\`\~\[\]\{\}\\\|\<\>\/]/g);
     };

File test/playground.js

 
 return;
 
-//logger tests
+//Contemplating a second approach for win32 only, but it would provide us with a fully compliant Browser Object Model
+//instead of a DOMv1-only spec.  But the node module win32ole has some quirks that would need serious consideration
+//and precaution, i.e. strings returned from ActiveXObjects do not inherit from String.prototype, and would need to be cast
+//with toString() calls to fix this issue.  Other oddities, like checking booleans in truthy ways, will not work either without
+//some special hacks implemented by win32ole.
+require('win32ole');
+
+var ie = new ActiveXObject('InternetExplorer.Application');
+
+ie.navigate('http://www.marcuspope.com/');
+
+setTimeout(function waitForReady() {
+    if (ie.document.readyState == "complete") {
+       console.log(ie.document.body.innerHTML.toString().paragraphs().clean()[5]);
+    }
+    else {
+        setTimeout(waitForReady, 500);
+    }
+}, 500);
+
+console.log('yerp');
+return;//logger tests
 //log().levels('debug info error');
 //log('test');
 //log(new Error('hrm, this error stinks'));
 var uri = "https://a248.e.akamai.net/assets.github.com/images/modules/header/logov7-hover.png?1324325453";
 
 http.binary(uri, "c:\\github.png", function(r) {
-    alert(r);    
+    alert(r);
 });
 
 http.get('http://localhost:8080/', function(res) {
-    alert(res.length);    
+    alert(res.length);
 });
 
 http.post('http://localhost:2000/', {

File web/verboten.js

                 replace(/\&nbsp;{4}/g, "\t").
                 replace(/\&nbsp;/g, " ").
                 replace(/\&apos;/g, "'").
+                replace(/\&#39;/g, "'").
                 replace(/<br\/>/g, "\n");
         };
     
         };
     
         String.prototype.paragraphs = function() {
-            return this.split(/((\r\n){2,}|(\s){4,})/g);
+            return this.split(/((\r|\n){2,}|(\s){4,})/g);
         };
     
         String.prototype.words = function() {
             return this.toNumber().toRad();
         };
     
+        // @todo: let's group html specific string proto extensions into a sub module we autoload if needed
         String.prototype.prepHTML =
-        String.prototype.prepHtml = function() {
+        String.prototype.prepHtml = function(origin) {
             //Prep html inserts non breaking spaces in key parts of html so that calling innerText on the resulting
             //dom node will not merge two string parts together
-            return this.replace(/<br\s?\/?>/gi, '\n').replace(/<\/p>/i, "\n\n").replace(/<\/li>/i, "\n\n").replace(/<\/(.*?)>/g, function(v) {
+            return this.replace(/<a (.*?)>(.*?)<\/a>/gi, function(all, attr, text, pos) {
+                //replace anchors with textual link references
+                var url = (attr.match(/href=['"](.*?)['"]/i) || "")[1];
+    
+                //no href? let strip tags handle the anchor
+                if (!url) return all;
+    
+                //normalize relative/root urls to absolute paths
+                if (origin) {
+                    var host, path;
+    
+                    origin.split('/').use(function() {
+                        host = this.before(4).join('/');
+                        path = this.slice(0, -1).join('/');
+                    });
+    
+                    if (url.startsWith('/')) {
+                        url = host + url;
+                    }
+                    if (url.startsWith('.')) {
+                        url = path + url;
+                    }
+                }
+    
+                return "[" + text + "](" + url + ")";
+            }).replace(/<img (.*?)\/?>/gi, function(all, attr, pos) {
+                var alt = (attr.match(/alt=['"](.*?)['"]/i) || "")[1];
+    
+                if (alt) {
+                    return "[image: " + alt + "]";
+                }
+    
+                return all;
+            }).replace(/<br\s?\/?>/gi, '\n').replace(/<\/(p|li|title)>/i, "\n\n").replace(/<\/(.*?)>/g, function(v) {
                 return "&nbsp;" + v;
             });
         };
     
         String.prototype.stripXml =
-        String.prototype.stripHtml = function(ie_opt, tag_opt) {
-            //@todo: implement 3rd party dom parser
-            tag_opt = def(tag_opt, "div");
-    
-            var obj = ie_opt.document.createElement(tag_opt);
+        String.prototype.stripHtml = function(url, ie_opt, tag_opt) {
+            var res = "";
+    
+            if (!ie_opt) {
+                res = this.removeDoctype().removeScripts().removeCSS().prepHTML(url).stripTags();
+            }
+            else {
+                tag_opt = def(tag_opt, "div");
+    
+                var obj = ie_opt.document.createElement(tag_opt);
                 obj.innerHTML = this.prepHtml();
-            var txt = obj.innerText
+                res = obj.innerText
                 obj = null;
-    
-            return txt;
+            }
+    
+            return res;
+            // @todo: consider something along these lines...
+            return res.lines().ea(function(v) { return v.trim(); }).join("\n");
         };
     
         String.prototype.isHtml = function() {
             return (this.stripTags() != this);
         };
     
+        String.prototype.removeDoctype = function() {
+            return this.replace(/<!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>/ig, "");
+        };
+    
         String.prototype.stripTags = function(allowed) {
             // Strips HTML and PHP tags from a string
     
     
         String.prototype.removeScripts = function() {
             //removes any script tags from this string
-            var block = false;
-            return this.lines().ea(function(txt) {
-                if (!block) {
-                    if ( txt.contains("<" + "script") &&
-                        !txt.contains("src=")) {
-                        block = true;
-                    }
-                } else {
-                    if (txt.contains("</" + "script>")) {
-                        block = false;
-                    } else {
-                        return txt;
-                    }
-                }
-            }).join("\r\n");
+            return this.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, " ");
         };
     
         String.prototype.cssTags = function() {
     
         String.prototype.removeCSS = function() {
             //removes any style tags from this string
-            var block = false;
-            return this.lines().ea(function(txt) {
-                if (!block) {
-                    if ( txt.contains("<" + "style") &&
-                        !txt.contains("href=")) {
-                        block = true;
-                    }
-                } else {
-                    if (txt.contains("</" + "style>")) {
-                        block = false;
-                    } else {
-                        return txt;
-                    }
-                }
-            }).join("\r\n");
+            return this.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, " ");
         };
     
         String.prototype.between = function(left, right) {