Commits

UENISHI Kota  committed 1bce5eb
  • Participants
  • Parent commits d3c2fa3

Comments (0)

Files changed (2)

File sitemapper.ml

+(* print all url in sitemap : you need xml-light and netclient *)
+
+open Http_client.Convenience (* to use this, link 'netclient' from OCAMLPACKS *)
+open Xml
+
+let get_xml url =
+  let s = http_get url in
+  print_string s;
+  flush stdout;;
+
+let get_root_url username = 
+  Printf.sprintf "http://%s.tumblr.com" username;;
+
+let get_top_sitemap root_url = 
+  Printf.sprintf "%s/sitemap.xml" root_url;;
+
+(* <sometag>
+   <loc>http://example.com</loc>
+   </sometag>  => [ "http://example.com" ] *)
+let get_loc_from_xml locallist sitemap_xml = 
+  let rec locs list lcs =
+    match lcs with 
+      | []-> list ;
+      | loc::remain->
+	  match Xml.tag loc with
+	    |"loc"->
+	       let urls = Xml.map Xml.pcdata loc in  (*	    prerr_endline (Xml.to_string loc); *)
+		 locs (urls @ list) remain ;
+	    | _->
+		locs list remain;
+  in
+    locs locallist (Xml.children sitemap_xml);;
+
+let gather_sitemap_urls username = 
+  let root_url = Printf.sprintf "http://%s.tumblr.com" username in
+  let top_sitemap_url = Printf.sprintf "%s/sitemap.xml" root_url in
+  let top_sitemap_xml = Xml.parse_string (http_get top_sitemap_url) in
+   (* print_endline (Xml.tag top_sitemap_xml); *)
+    begin match (Xml.tag top_sitemap_xml) with 
+      | "sitemapindex" ->
+	  Xml.fold get_loc_from_xml [] top_sitemap_xml;
+      | other ->
+	  prerr_endline ("PCData got. Not good..." ^ other);
+	  []
+    end;;
+
+(*
+ tumblr_sitemap sitemap_url = object (self)
+  val urls = 
+  method get_url = urls
+  method print_urls = 
+    let rec print list = 
+      match list with 
+	|[] -> (); 
+	|url::remain-> print_endline url; self#print remain
+  method contents = 
+    let get_content url =
+      Xml.parse_string (http_get url) in
+      List.map get_content urls
+end;; *)
+
+let rec gather_post_urls former sitemap_urls = 
+  match sitemap_urls with 
+    | [] -> former;
+    | sitemap_url::remain->
+	print_endline sitemap_url;
+	let urlset = Xml.children (Xml.parse_string (http_get sitemap_url)) in
+	let urls = List.flatten (List.map (get_loc_from_xml []) urlset )in
+	(*let con = sitemap#contents in*)
+	  gather_post_urls (urls @ former) (remain);;
+
+let rec print_urls list = 
+  match list with 
+    | [] -> print_endline "";
+    | url::remain -> 
+	print_endline url;
+	print_urls remain;;
+
+let sitemap_urls = gather_sitemap_urls "kuenishi" in
+let post_urls = gather_post_urls [] sitemap_urls in
+  print_urls post_urls;;
+
+(*let get_all_photos username = 
+  get_xml get_root_url username;; *)
+

File tumblr.ml

-open Http_client.Convenience (* to use this, link 'netclient' from OCAMLPACKS *)
-open Xml
-
-let get_xml url =
-  let s = http_get url in
-  print_string s;
-  flush stdout;;
-
-let get_root_url username = 
-  Printf.sprintf "http://%s.tumblr.com" username;;
-
-let get_top_sitemap root_url = 
-  Printf.sprintf "%s/sitemap.xml" root_url;;
-
-(* <sitemap>
-   <loc>http://example.com</loc>
-   </sitemap> *)
-let get_url_from_sitemap_xml locallist sitemap_xml = 
-  let rec locs list lcs =
-    match lcs with 
-      | []-> list ;
-      | loc::remain->
-	  let urls = Xml.map Xml.pcdata loc in
- (*	    prerr_endline (Xml.to_string loc); *)
-	    locs (urls @ list) remain
-  in
-    locs locallist (Xml.children sitemap_xml);;
-
-let gather_urls username = 
-  let root_url = Printf.sprintf "http://%s.tumblr.com" username in
-  let top_sitemap_url = Printf.sprintf "%s/sitemap.xml" root_url in
-  let top_sitemap_xml = Xml.parse_string (http_get top_sitemap_url) in
-   (* print_endline (Xml.tag top_sitemap_xml); *)
-    begin match (Xml.tag top_sitemap_xml) with 
-      | "sitemapindex" ->
-	  Xml.fold get_url_from_sitemap_xml [] top_sitemap_xml;
-      | other ->
-	  prerr_endline ("PCData got. Not good..." ^ other);
-	  []
-    end;;
-
-let rec print_urls list = 
-  match list with 
-    | [] -> print_endline "";
-    | url::remain -> 
-	print_endline url;
-	print_urls remain;;
-
-print_urls (gather_urls "kuenishi");;
-
-
-
-(*let get_all_photos username = 
-  get_xml get_root_url username;; *)
-