Source

tumblr-tools / sitemapper.ml

Full commit
(* print all url in sitemap : you need xml-light and netclient *)

open Http_client.Convenience (* to use this, link 'netclient' from OCAMLPACKS *)
open Xml

let get_xml url =
  let s = http_get url in
  print_string s;
  flush stdout;;

let get_root_url username = 
  Printf.sprintf "http://%s.tumblr.com" username;;

let get_top_sitemap root_url = 
  Printf.sprintf "%s/sitemap.xml" root_url;;

(* <sometag>
   <loc>http://example.com</loc>
   </sometag>  => [ "http://example.com" ] *)
let get_loc_from_xml locallist sitemap_xml = 
  let rec locs list lcs =
    match lcs with 
      | []-> list ;
      | loc::remain->
	  match Xml.tag loc with
	    |"loc"->
	       let urls = Xml.map Xml.pcdata loc in  (*	    prerr_endline (Xml.to_string loc); *)
		 locs (urls @ list) remain ;
	    | _->
		locs list remain;
  in
    locs locallist (Xml.children sitemap_xml);;

let gather_sitemap_urls username = 
  let root_url = Printf.sprintf "http://%s.tumblr.com" username in
  let top_sitemap_url = Printf.sprintf "%s/sitemap.xml" root_url in
  let top_sitemap_xml = Xml.parse_string (http_get top_sitemap_url) in
   (* print_endline (Xml.tag top_sitemap_xml); *)
    begin match (Xml.tag top_sitemap_xml) with 
      | "sitemapindex" ->
	  Xml.fold get_loc_from_xml [] top_sitemap_xml;
      | other ->
	  prerr_endline ("PCData got. Not good..." ^ other);
	  []
    end;;

(*
 tumblr_sitemap sitemap_url = object (self)
  val urls = 
  method get_url = urls
  method print_urls = 
    let rec print list = 
      match list with 
	|[] -> (); 
	|url::remain-> print_endline url; self#print remain
  method contents = 
    let get_content url =
      Xml.parse_string (http_get url) in
      List.map get_content urls
end;; *)

let rec gather_post_urls former sitemap_urls = 
  match sitemap_urls with 
    | [] -> former;
    | sitemap_url::remain->
	print_endline sitemap_url;
	let urlset = Xml.children (Xml.parse_string (http_get sitemap_url)) in
	let urls = List.flatten (List.map (get_loc_from_xml []) urlset )in
	(*let con = sitemap#contents in*)
	  gather_post_urls (urls @ former) (remain);;

let rec print_urls list = 
  match list with 
    | [] -> print_endline "";
    | url::remain -> 
	print_endline url;
	print_urls remain;;

let sitemap_urls = gather_sitemap_urls "kuenishi" in
let post_urls = gather_post_urls [] sitemap_urls in
  print_urls post_urls;;

(*let get_all_photos username = 
  get_xml get_root_url username;; *)