Commits

Anonymous committed d3c2fa3

getting sitemap<N>.xml done

Comments (0)

Files changed (1)

 (* Build the URL of the top-level sitemap for the site rooted at [root_url]
    by appending /sitemap.xml to it. *)
 let get_top_sitemap root_url =
   root_url |> Printf.sprintf "%s/sitemap.xml";;
 
+(* Collect the text found under each child of [sitemap_xml] and prepend it
+   to [locallist].  The element is expected to look like:
+     <sitemap>
+       <loc>http://example.com</loc>
+     </sitemap>
+   Every child (e.g. <loc>) is mapped with [Xml.pcdata]; the strings coming
+   from a later child end up in front of those from an earlier one. *)
+let get_url_from_sitemap_xml locallist sitemap_xml =
+  let prepend_texts acc node = Xml.map Xml.pcdata node @ acc in
+  List.fold_left prepend_texts locallist (Xml.children sitemap_xml);;
+
 (* Fetch the top-level sitemap of the Tumblr blog named [username] and
    collect URLs from it.  The document is downloaded with [http_get]
    (defined outside this excerpt) and parsed with [Xml.parse_string].
    NOTE(review): the sitemap URL is rebuilt inline here with the same format
    string that [get_top_sitemap] uses -- confirm whether that helper was
    meant to be called instead. *)
 let gather_urls username = 
   let root_url = Printf.sprintf "http://%s.tumblr.com" username in
   let top_sitemap_url = Printf.sprintf "%s/sitemap.xml" root_url in
   let top_sitemap_xml = Xml.parse_string (http_get top_sitemap_url) in
-    Printf.printf "XML formated = \n%s" (Xml.to_string_fmt top_sitemap_xml);
-    Xml.to_string_fmt top_sitemap_xml
-;;
+   (* Dispatch on the root tag: a sitemapindex root is folded with
+      [get_url_from_sitemap_xml] starting from the empty list; any other tag
+      is reported on stderr and the result is the empty list.
+      NOTE(review): the stderr message mentions PCData although [other] is
+      bound to a tag name -- presumably a leftover; confirm.
+      Debugging aid: print_endline (Xml.tag top_sitemap_xml); *)
+    begin match (Xml.tag top_sitemap_xml) with 
+      | "sitemapindex" ->
+	  Xml.fold get_url_from_sitemap_xml [] top_sitemap_xml;
+      | other ->
+	  prerr_endline ("PCData got. Not good..." ^ other);
+	  []
+    end;;
 
-print_endline (gather_urls "kuenishi");;
+(* Print every URL in [list] on its own line of standard output, in list
+   order, followed by one empty line. *)
+let print_urls list =
+  List.iter print_endline list;
+  print_endline "";;
+
+(* Entry point: gather the URLs for the blog kuenishi and print them. *)
+let () = print_urls (gather_urls "kuenishi");;