Commits

Anonymous committed fbeea54

initial commit

Files changed (3)

fork.pl

+#!/usr/bin/perl
+
+# fork.pl
+# (c) 2010 Alexandr A Alexeev
+# http://eax.me/
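+#
+# Reads lines from STDIN and distributes them across several child
+# processes, each running <cmd> with stdin/stdout tied to a socketpair;
+# the children's output is multiplexed back onto STDOUT with select().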
+
+use strict;
+use Socket;
+use IO::Select;
+use IO::Handle;
+use constant DEBUG => 0;
+
+my $selread = IO::Select->new();
+my $selwrite = IO::Select->new();
+
+my $cmd = shift;
+my $childs = abs(shift || 0);
+my $total_lines = abs(shift || 0);
+my $percent_mult = $total_lines ? 100/$total_lines : 0; # line number -> percent
+die "Usage: $0 <cmd> [childs=8] [total_lines]\n" unless($cmd);
+
+$childs = 8 unless($childs);
+dbg_print("cmd = '$cmd', childs = $childs");
+
+for my $num (1..$childs) {
+   dbg_print("starting child #$num...");
+   my ($hchild, $hparent, $childid);
+   socketpair($hchild, $hparent, AF_UNIX, SOCK_STREAM, PF_UNSPEC)
+     or die "socketpair: $!";
+
+   $childid = fork;
+   die "cannot fork: $!" unless(defined $childid); # fork returns undef on failure
+
+   # redirect the child's input/output into the socket
+   unless($childid) {
+     # child
+     open STDIN, "<&", $hparent;
+     open STDOUT, ">&", $hparent;
+     open STDERR, ">&", $hparent;
+     close $hparent;
+     close $hchild;
+     # handles inherited from earlier children must be closed
+     for my $h($selread->handles) {
+       $selread->remove($h);
+       $selwrite->remove($h);
+       close $h;
+     }
+     exec $cmd or die "exec '$cmd': $!";
+   }
+
+   close $hparent;
+   $selread->add($hchild);
+   $selwrite->add($hchild);
+}
+
+dbg_print("All done, now working...");
+
+# an "error" can occur when the input buffer fills up,
+# which is why we do not watch for that event
+my $line_number = 0;
+my $old_percent = 0;
+while(($childs) && (my($read, $write, undef) = 
+         IO::Select::select($selread, $selwrite, undef))) {
+   # sockets we can read from
+   for my $h(@{$read}) {
+     if($h->eof || $h->error) {
+       # or end of file
+       dbg_print("PARENT: eof, h=".fileno($h));
+       $selread->remove($h);
+       $selwrite->remove($h);
+       close $h;
+       $childs--;
+     } else {
+       # read a line and print it
+       dbg_print("PARENT: reading from child, h=".fileno($h));
+       my $line = $h->getline();
+       chomp($line);
+       print "$line\n";
+     }
+   }
+
+   # sockets we can write to
+   for my $h(@{$write}) {
+     # anything available on STDIN?
+     if(my $line = <STDIN>) {
+       if($total_lines) {
+         my $new_percent = int(++$line_number*$percent_mult);
+         if($new_percent > $old_percent) {
+           $old_percent = $new_percent;
+           chomp(my $date = `date '+%d.%m.%y %H:%M'`);
+           print STDERR "[$date] parsing line $line_number/$total_lines ($new_percent%)\n";
+         }
+       }
+       # pass the line to the child process
+       chomp($line);
+       dbg_print("PARENT: writing to child, h=".fileno($h).", line=$line");
+       $h->say($line); # appends a trailing \n
+       $h->flush(); # push the data out immediately
+     } else {
+       # STDIN is empty
+       dbg_print("PARENT: no input, h=".fileno($h));
+       for my $hh($selwrite->handles()) {
+         shutdown($hh, 1); # close the socket for writing
+         $selwrite->remove($hh);
+       }
+       last;
+     }
+   }
+}
+
+# debug output
+sub dbg_print {
+   my $msg = $_[0];
+   print STDERR "$msg\n" if(DEBUG);
+}
+
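fork.pl assumes only that the worker reads lines from STDIN and writes
newline-terminated lines to STDOUT. A minimal sketch of a compatible worker
(the file name worker.pl is made up for this example):

    #!/usr/bin/perl
    # worker.pl: toy line-in/line-out filter for fork.pl
    use strict;
    $| = 1; # flush each line immediately so the parent's select() sees it
    while (my $line = <STDIN>) {
        chomp $line;
        print uc($line), "\n"; # any per-line transformation goes here
    }

It would be started as, for example: cat input.txt | ./fork.pl ./worker.pl 16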
get-links.pl

+#!/usr/bin/perl
+
+# get-links.pl
+# (c) Alexandr A Alexeev
+# http://eax.me/
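+#
+# Reads URLs from STDIN (one per line), fetches each page with wget, and
+# prints one tab-separated record per extracted text link:
+# page_url, link_url, anchor_text, dofollow|nofollow, links_on_page, page_size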
+
+use strict;
+use utf8;
+use HTML::LinkExtractor;
+
+use constant WGET_ARGS => '--timeout=10 -t 2'; # 10 s timeout, at most 2 tries
+use constant MAX_URL_LENGTH => 256;
+use constant MAX_ANCHOR_LENGTH => 128;
+use constant MAX_PAGE_SIZE => 256000;
+use constant DEBUG => 0;
+
+my $max_urls = abs(shift || 0); # max text links to print per page
+
+while(my $url = <>) {
+   chomp($url);
+   # "normalize" the link
+   $url = "http://$url" if($url !~ /^http:\/\//i);
+   next if(length($url) > MAX_URL_LENGTH);
+   # lowercase everything we can (the part before the query string)
+   $url = "\L$1\E$2" if($url =~ /^([^\?]+)(\?.*)?$/);
+
+   print STDERR "url == $url\n" if(DEBUG);
+
+   my $rurl = $url;
+   $rurl =~ s/'/'\\''/g; # escape single quotes for the shell
+   # fetch the page, cap it at MAX_PAGE_SIZE bytes, recode to UTF-8
+   my $cmd = "wget ".WGET_ARGS." -q '$rurl' -O - | head -c ".MAX_PAGE_SIZE." | enconv -L none -x utf-8";
+   my $data = `$cmd`;
+
+   next if($?);
+   my $fsize = length($data);
+
+   ################
+   ### PAYLOAD HERE ###
+   ################
+
+   my $lx = HTML::LinkExtractor->new();
+   $lx->parse(\$data);
+   my $links = $lx->links();
+
+   @{$links} = grep {
+     $_->{tag} eq "a" &&
+     $_->{href} !~ /^(mailto|javascript|https|ftp):/i
+   } @{$links};
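+   # note: https links are dropped above as well, since abs_url() below
+   # only handles http:// URLs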
+
+   my $num_links = scalar @{$links};
+   my $i = 0; 
+   for(@{$links}) {
+     my %h = %{$_};
+     my $href = abs_url($h{href}, $url);
+     next if(length($href) > MAX_URL_LENGTH);
+
+     # TODO: could also filter by domain zone, file extension, domain level, etc.
+
+     my ($html, $text) = $h{_TEXT} =~ /^<a([^>]*)>(.*)<\/a>$/is;
+     next unless(defined $text); # skip anchors the regex cannot parse
+     my $follow = $html !~ /rel=['"]{1}[^'"]*nofollow[^'"]*['"]{1}/i;
+     $follow = $follow ? "dofollow" : "nofollow";
+
+     # image detection is left disabled; anything still containing markup
+     # counts as "other":
+     # $type = $text =~ /^<img[^>]*>$/ ? "image" : ($text =~ /[<>]/ ? "other" : "text");
+     # print "$text:$type:$href:$follow\n";
+     my $type = $text =~ /[<>]/ ? "other" : "text";
+
+     utf8::decode($text);
+     $text = substr($text, 0, MAX_ANCHOR_LENGTH)
+       if(length($text) > MAX_ANCHOR_LENGTH);
+     $text =~ s/\s/ /gs;
+     utf8::encode($text);
+     if($type eq "text") {
+       # cut off the fragment identifier
+       $href = $1 if($href =~ /^([^#]+)#/);
+       # strip session ids
+       $href =~ s/&(PHPSES)?SID=[0-9a-f]+//ig;
+       $href =~ s/\?(PHPSES)?SID=[0-9a-f]+&?$//ig;
+       $href =~ s/\?(PHPSES)?SID=[0-9a-f]+&/\?/ig;
+       # again, lowercase everything we can
+       $href = "\L$1\E$2" if($href =~ /^([^\?]+)(\?.*)?$/);
+
+       # urlencode
+       $url =~ s/([^a-zA-Z0-9\%\&\?\:\;\/\=\.\,\#\-\_]{1})/sprintf("%%%02x",ord($1))/eg;
+       $href =~ s/([^a-zA-Z0-9\%\&\?\:\;\/\=\.\,\#\-\_]{1})/sprintf("%%%02x",ord($1))/eg;
+
+       print "$url\t$href\t$text\t$follow\t$num_links\t$fsize\n";
+       last if($max_urls && ++$i == $max_urls);
+     }
+   } # for links
+} # while url
+
+
+# see http://perlmonks.org/?node_id=523679
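+# e.g. abs_url('../b.html', 'http://example.com/a/x.html') first joins the
+# parts into 'example.com/a//../b.html'; the cleanup loops below then
+# collapse it to 'http://example.com/b.html'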
+sub abs_url {
+     my ( $relative, $base ) = @_;
+     return $relative if $relative =~ m{ \A http:// }ix;
+     my ( $host, $hostrelative_abs ) = $base =~ m{
+         \A
+         http:// # skip scheme
+         ([^/]*) # capture hostname
+         /*      # skip front slashes
+         (.*?)   # capture everything that follows, but
+         [^/]*   # leave out the optional final non-directory component
+         \z
+     }ix;
+     $hostrelative_abs = '' if $relative =~ m!^/!;
+     my $abs_url = join '/', $host, $hostrelative_abs, $relative;
+     # replace '//' or '/./' with '/'
+     1 while $abs_url =~ s{ / \.? (?=/|\z) }{}x;
+     # remove '/foo/..' (but be careful to skip '/../..')
+     1 while $abs_url =~ s{ / (?!\.\.) [^/]+ / \.\. (?=/|\z) }{}x;
+     return "http://$abs_url";
+}
+
parse-sites.pl

+#!/usr/bin/perl
+
+# parse-sites.pl
+# (c) Alexandr A Alexeev 2010
+# http://eax.me/
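+#
+# Crawl driver: each level's URL list is fed through fork.pl, which runs
+# NUM_CHILDS copies of get-links.pl; link targets that have not been seen
+# before become the next level's URL list.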
+
+use strict;
+# process pages up to MAX_LEVEL levels deep
+use constant MAX_LEVEL => 3;
+# take MAX_LINKS links from a site's main page
+use constant MAX_LINKS => 500;
+# and MIN_LINKS links from every other page
+use constant MIN_LINKS => 150;
+# number of parallel worker processes for parsing
+use constant NUM_CHILDS => 128;
+
+my $fname = shift;
+
+die "Usage: $0 <fname>\n" unless($fname);
+die "No such file - '$fname'\n" unless(-f $fname);
+
+# clean out the working directories (./work, ./result, ./tmp must exist)
+`(rm work/* && rm result/* && rm tmp/*) > /dev/null 2>&1`;
+
+# copy the list of start pages: deduplicate, add the http:// prefix, shuffle
+`cat $fname | sort -u -T ./tmp | \
+   perl -e 'while(<>){\$_="http://\$_" unless(\$_=~/^http:\\\/\\\//);print \$_;}' | \
+   rl > ./work/urls1.txt`;
+
+chomp(my $date = `date '+%d.%m.%y %H:%M'`);
+print "[$date] START\n";
+
+for my $level(1..MAX_LEVEL) {
+   chomp($date = `date '+%d.%m.%y %H:%M'`);
+   print "[$date] LEVEL: $level\n";
+   my $max_links = $level == 1 ? MAX_LINKS : MIN_LINKS;
+   my $num_urls = abs(`wc -l < ./work/urls$level.txt`);
+   print "[$date] DOWNLOADING $num_urls URLS\n";
+   system("cat ./work/urls$level.txt | ./fork.pl './get-links.pl $max_links' ".NUM_CHILDS.
+            " $num_urls > ./result/level$level.txt"); 
+   last if($level == MAX_LEVEL);
+
+   chomp($date = `date '+%d.%m.%y %H:%M'`);
+   print "[$date] SEARCHING NEW URLS\n";
+   my $next_level = $level + 1;
+   `cat ./work/urls$level.txt >> ./work/done.txt`;
+
+   `cat ./result/level$level.txt | cut -f 2 | sort -u -T ./tmp |
+      grep -vFxf ./work/done.txt | rl > ./work/urls$next_level.txt`; 
+   die "ERROR: \$? = $?" if($?); 
+}
+
+chomp($date = `date '+%d.%m.%y %H:%M'`);
+print "[$date] ALL DONE!\n";
+
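A typical run, assuming a seed file with one site per line (the name
seeds.txt is illustrative):

    ./parse-sites.pl seeds.txt

Progress lines are printed to STDOUT; per-level link records accumulate in
./result/level*.txt, and intermediate URL lists live under ./work/.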