2012-11-12

perl - 小說網頁合併 lwp1.pl

lwp1.pl(小說網頁合併)

2012-11-12
因為沒備份, 舊程式損毀, 重寫

我的wiki - lwp1.pl(小說網頁合併)
  • 執行程式,產生附檔名為 .html 的合併 TXT 檔案(來源為黃金屋小說-烈空)
    perl lwp1.pl http://tw.hjwzw.com/Book/Chapter/2393 0 20 _ > hjw_2393_00-20.html
    
  • 小說網頁合併程式 : lwp1.pl
    #!/usr/bin/perl
    
    # 程式 : lwp1.pl
    # 目的 : 抓取網頁中的 link
    
    # V 0.2.1  2012-11-11 改 黃金屋 $head_link = 39
    
    # use LWP;
    use LWP::Simple;
    
    use HTML::TreeBuilder;
    use HTML::FormatText;
    use HTML::Parse;
    
    use utf8;
    use Encode;
    
    # ---- Command-line handling ---------------------------------------------
    # Expected arguments:
    #   $ARGV[0]  URL of the page that lists the chapter links
    #   $ARGV[1]  first chapter to fetch (offset added to $head_link later on)
    #   $ARGV[2]  last chapter to fetch  (offset added to $head_link later on)
    # Any further arguments are ignored.
    my $argv_no = $#ARGV;
    
    if ($argv_no < 2) {
      # Diagnostics go to STDERR: the documented invocation redirects STDOUT
      # into the merged output file.  No encoding layer is installed yet, so
      # the wide-character text is encoded by hand.
      print STDERR encode('UTF-8', "argv_no : $argv_no\n");
      print STDERR encode('UTF-8', "argv is less than 3\n");
    
      print STDERR encode('UTF-8', "執行 : program  url_link    start_link     stop_link \n");
      die("Usage: ./$0 http://tw.hjwzw.com/Book/Chapter/2239   10  20 \n");
    }
    my $urlStr     = $ARGV[0];            # URL of the chapter index page
    
    my $startNum   = $ARGV[1];            # first chapter offset
    my $endNum     = $ARGV[2];            # last chapter offset
    
    # An empty (or "0") URL can never be fetched; reject it up front.
    die encode('UTF-8', "您沒有輸入 url 網址!\n") unless $urlStr;
    
    # Both offsets are used in integer arithmetic and as array indexes below;
    # a non-numeric value would silently count as 0.
    for my $num ($startNum, $endNum) {
      die("start_link / stop_link must be integers, got '$num'\n")
        unless $num =~ /\A-?\d+\z/;
    }
    
    # $url is kept as an alias of the index URL for later error messages.
    my $url = $urlStr;
    
    # Scratch variables filled in by the chapter loop further down.
    my $html_url;      # HTML returned by get() for one chapter page
    my $ascii;         # text rendering produced by HTML::FormatText
    my $UrlInHtml;     # URL of the chapter page being fetched
    
    # Site-specific counts of links that are not chapters (2012-11-11).
    # hjwzw (http://tw.hjwzw.com/Book/Chapter/2239):
    #   leading links to skip : 38   (39 was used in an earlier revision)
    #   trailing links to skip: 24
    # 588zw (http://www.588zw.com/modules/article/reader.php?aid=7150):
    #   leading links to skip : 18
    #   trailing links to skip: 10
    my $tail_link = 24;
    my $head_link = 38;
    
    # Header of the merged output: index URL and the requested chapter range.
    print join('', $urlStr, "    Start Page : ", $startNum, "\t Stop Page : ", $endNum, "
    \n");
    
    print "=========================================================================
    \n";
    
    # From here on STDOUT is written as UTF-8; STDIN gets a Big5 decoding layer.
    binmode STDOUT, ':encoding(utf8)';
    binmode STDIN,  ':encoding(big5)';
     
    # ---- Fetch the index page and work out which links to follow ------------
    # Split the index URL into scheme+host, optional port and the rest.
    # $host is later prepended to relative chapter links.  Both http and
    # https are accepted; with an http-only pattern an https index URL would
    # leave $host undefined and every relative link would lose its host.
    my ($host, $port, $file) = ($urlStr =~ m|(https?://[^/:]+):{0,1}(\d*)(\S*)$|);
     
    # LWP::Simple is what the file imports; load the user-agent class
    # explicitly instead of relying on it being pulled in as a side effect.
    require LWP::UserAgent;
    my $ua  = LWP::UserAgent->new;
    my $res = $ua->get($urlStr);
    
    die "Can't get $urlStr ", $res->status_line unless $res->is_success;
    
    # Raw response body; the chapter loop decodes the link text itself.
    my $html = $res->content;
    
    # Collect every <a> element of the index page, in document order.
    my $root  = HTML::TreeBuilder->new_from_content($html);
    my @links = $root->look_down(
       _tag => 'a',
    );
    
    # Translate the requested chapter offsets into indexes of @links:
    # the first $head_link links and the last $tail_link links are skipped.
    my $strtLnk = $head_link + $startNum;
    $strtLnk = $head_link if $strtLnk < $head_link;
    
    my $lastChapterLnk = $#links - $tail_link;
    my $stopLnk = $head_link + $endNum;
    $stopLnk = $lastChapterLnk if $stopLnk > $lastChapterLnk;
    
    # Nothing will be fetched in this case; say so instead of ending silently.
    warn "No chapter links in the requested range ($startNum .. $endNum)\n"
       if $strtLnk > $stopLnk;
    # print "前導連結數=$head_link  結尾連結數=$tail_link\n";
    
    # print "開始頁=$startNum     結束頁=$endNum\n";
    # print "實際開始=$strtLnk      實際終止=$stopLnk\n";
     
    # ---- Download each selected chapter and print its text -------------------
    # For every link index from $strtLnk to $stopLnk:
    #   1. build the chapter URL (absolute href as is, otherwise $host . href),
    #   2. print the chapter title and URL followed by a separator line,
    #   3. fetch the page, render it as text, and print the lines that follow
    #      the third line containing the chapter title,
    #   4. print a closing separator.
    for (my $j = $strtLnk; $j <= $stopLnk; $j++) {     # walk the selected links
    
       my $href = $links[$j]->attr('href');
       if ( !defined $href ) {
          # An anchor without href cannot be followed.
          warn "Link $j has no href - skipped\n";
          next;
       }
    
       # Link text, decoded from UTF-8 bytes; it doubles as the marker that is
       # searched for in the chapter page below.
       my $utf8Str = decode("utf8", $links[$j]->as_trimmed_text );
       my $ss      = $utf8Str;
    
       # An href is absolute only if it *starts* with a scheme (http or https);
       # anything else is treated as a path on the index page's host.
       if ( $href =~ m|\Ahttps?://[^/:]+| ) {
          $UrlInHtml = $href;
       }
       else {
          $UrlInHtml = $host . $href;
       }
     
       # 2012-11-11
       print $utf8Str . " " . $UrlInHtml . "
    \n";
    
       print "=========================================================================
    \n";
    
       # LWP::Simple::get returns undef when the request fails.
       $html_url = get($UrlInHtml);
       if ( !defined $html_url ) {
          warn "Can't get $UrlInHtml - chapter skipped\n";
          next;
       }
    
       $ascii = HTML::FormatText->new->format(parse_html($html_url));
    
       # NOTE(review): without /m this strips two spaces at the very start of
       # the text only, not at the start of every line - confirm the intent.
       $ascii =~ s/^  //g;
    
       # $startprint counts lines containing the title, capped at 3.
       # Printing starts with the line after the third such line.
       my $startprint = 0;
       my @text = split(/\n/, $ascii);
    
       # The last 17 lines are never examined or printed
       # (presumably site footer text - TODO confirm).
       my $lastLine = $#text - 17;
    
       for my $kk (0 .. $lastLine) {
          if ( $startprint == 3 ) {  print $text[$kk] . "
    \n";  }
    
          # Literal substring test: the title may contain regex metacharacters
          # such as ( ) + ? and must not be interpreted as a pattern.
          next if index($text[$kk], $ss) < 0;
          next if $startprint >= 3;
    
          $startprint++;
          if ( $startprint == 3 ) {
             # Marker line emitted when printing is switched on.
             print "\$startprint=$startprint" . "
    \n";
          }
       }
       print "
    \n";
       print "=========================================================================
    \n\n";
    }
    
    # Release the parse tree of the index page and finish.
    $root->delete;
    exit;
    

相關 :
----

沒有留言: