lwp1.pl(小說網頁合併)
2012-11-12因為沒備份, 舊程式損毀, 重寫
我的wiki - lwp1.pl(小說網頁合併)
-
執行程式,產生副檔名為 .html 的合併文字檔(來源為黃金屋小說-烈空)
perl lwp1.pl http://tw.hjwzw.com/Book/Chapter/2393 0 20 _ > hjw_2393_00-20.html
-
小說網頁合併程式 : lwp1.pl
#!/usr/bin/perl
# Program : lwp1.pl
# Purpose : Fetch an index page, follow a selected range of its links and
#           print the text of every linked page to STDOUT as one merged
#           document.
# V 0.2.1 2012-11-11  adjusted for the hjwzw site ($head_link)
#
# Usage   : perl lwp1.pl <index_url> <start_link> <stop_link> > merged.html
#
# <start_link>/<stop_link> are offsets counted after the first $head_link
# links of the index page; the last $tail_link links are never followed.
#
use strict;
use warnings;
use utf8;
use Encode;
use LWP;
use LWP::Simple;
use HTML::TreeBuilder;
use HTML::FormatText;
use HTML::Parse;

# Set the I/O layers before anything is printed: the usage and error
# messages below contain non-ASCII literals (the source is `use utf8`).
binmode(STDOUT, ':encoding(utf8)');
binmode(STDERR, ':encoding(utf8)');
binmode(STDIN,  ':encoding(big5)');

my $argv_no = $#ARGV;
if ($argv_no < 2) {
    print("argv_no : $argv_no\n");
    print("argv is less than 3\n");
    print("執行 : program url_link start_link stop_link \n");
    die("Usage: ./$0 http://tw.hjwzw.com/Book/Chapter/2239 10 20 \n");
}

my $urlStr   = $ARGV[0];    # index page URL
my $startNum = $ARGV[1];    # first link offset
my $endNum   = $ARGV[2];    # last link offset

$urlStr
    or die "您沒有輸入 url 網址! http://tw.futures.finance.yahoo.com/future/\n";

# Both offsets take part in arithmetic below, so reject non-integers early.
for my $offset ($startNum, $endNum) {
    die("Usage: ./$0 http://tw.hjwzw.com/Book/Chapter/2239 10 20 \n")
        unless $offset =~ m{\A-?\d+\z};
}

# Number of links to skip at the top / bottom of the index page.
# Values for http://tw.hjwzw.com/Book/Chapter/2239 :
my $head_link = 38;
# my $head_link = 39;
my $tail_link = 24;
# Values for http://www.588zw.com/modules/article/reader.php?aid=7150 :
# my $head_link = 18;
# my $tail_link = 10;

# Number of trailing text lines dropped from every fetched page.
# NOTE(review): presumably the site footer -- confirm against a live page.
my $tail_lines = 17;

# NOTE(review): the output is redirected to a .html file; the " \n" line
# endings may originally have carried HTML markup that was lost when the
# script was transcribed -- confirm before relying on the exact output.
my $separator =
    "========================================================================= \n";

print $urlStr, " Start Page : ", $startNum, "\t Stop Page : ", $endNum . " \n";
print $separator;

# scheme://host[:port] of the index page, used to resolve relative hrefs.
my ($origin) = $urlStr =~ m{\A(https?://[^/\s]+)}i;
$origin = '' unless defined $origin;

my $ua  = LWP::UserAgent->new;
my $res = $ua->get($urlStr);
die "Can't get $urlStr ", $res->status_line unless $res->is_success;

my $root  = HTML::TreeBuilder->new_from_content($res->content);
my @links = $root->look_down(_tag => 'a');

# Translate the requested offsets into indexes of @links and clamp them to
# the window between the leading and trailing links.
my $first = $head_link + $startNum;
$first = $head_link if $first < $head_link;
my $last = $head_link + $endNum;
$last = $#links - $tail_link if $last > $#links - $tail_link;

for my $idx ($first .. $last) {
    my $link = $links[$idx];
    my $href = $link->attr('href');

    # Anchors without an href cannot be followed.
    next unless defined $href && length $href;

    my $title = decode('utf8', $link->as_trimmed_text);

    # Only an href that *starts* with a scheme is absolute; anything else
    # is appended to the origin of the index page.
    my $chapter_url = $href =~ m{\Ahttps?://}i ? $href : $origin . $href;

    print $title . " " . $chapter_url . " \n";
    print $separator;

    my $page = get($chapter_url);
    if (defined $page) {
        my $tree  = parse_html($page);
        my $ascii = HTML::FormatText->new->format($tree);
        $tree->delete;    # release the parse tree before the next page

        $ascii =~ s/^ //g;
        my @text = split(/\n/, $ascii);

        # Lines are printed only after the link title has been seen three
        # times in the page text; the line holding the third occurrence is
        # itself not printed.
        my $title_hits = 0;
        for my $k (0 .. $#text - $tail_lines) {
            my $line = $text[$k];
            if ($title_hits == 3) {
                print $line . " \n";
                next;
            }

            # Literal substring test: the title is data, not a pattern, so
            # it must not be interpolated into a regex. An empty title
            # matches every line.
            next if index($line, $title) < 0;

            $title_hits++;
            print "\$startprint=$title_hits" . " \n" if $title_hits == 3;
        }
    }
    else {
        # LWP::Simple::get returns undef on failure; report it and go on
        # with the next link instead of parsing undef.
        warn "Can't get $chapter_url\n";
    }

    print " \n";
    print $separator, "\n";
}

$root->delete;
exit;
相關 :
沒有留言:
張貼留言