php多线程爬虫（pthread.so多线程扩展）爬某某站点

之前，本博客发过一篇文章，写的是用php爬某博客的文章（当然，在项目中，还是用火车头这样的专业工具去爬咯~~~但是在学习的过程中，还是要多重复造轮子的）。当时爬那300多页、共3k篇左右的文章（包括图片），单线程需要好几个小时，而且当时也是用面向过程的方式写的。对于多线程的研究，始于项目测QPS的需要。其实，在小公司真的很苦逼，没有专业的运维团队，服务器环境的搭建部署，都要我这个后端程序员完成。环境采用nginx做软负载均衡，后端是内网的apache webserver。推广运营预估了用户量，然后就开始测试当前服务器配置的QPS。一开始用apache的ab测，发现有太多局限性；后来，我们的资深Android客户端同事帮我用java写了个多线程压力测试工具，于是，我就决定去找找php有没有多线程~~~

php作为一门解释型语言，zend引擎没有内置的多线程模块，那可以用c/cpp开发扩展来实现吗？果然，就找到了pthread这个扩展。这个扩展支持php5.3以后的版本；因为php7对zval有过改动，要支持php7应该需要改写，所以应该是不支持的吧。pthread扩展装起来…

都是爬的同一个网站（www.heu8.com），同样的页数（包括下载图片，写入mysql数据库）。多线程花了75秒，单线程花了201秒。(地址soft.feehi.com)

上代码(多线程版爬虫，oop封装，heu8.php为执行脚本，spider.php为爬虫类，pthread.php为多线程类)：

heu8.php

<?php
    // Entry script: configures a \fee\Spider and crawls the list pages of www.heu8.com.
    // Usage: php heu8.php [start] [length]
    require_once "spider.php";
    $spider = new \fee\Spider( true );
    // Pages are crawled from $start downwards while the page number is greater than $length,
    // i.e. pages $start, $start-1, ..., $length+1.
    $start = isset( $argv[1] ) ? (int)$argv[1] : 2;// first (highest) page number to crawl
    $length = isset( $argv[2] ) ? (int)$argv[2] : 0;// exclusive lower bound of the page range
    $spider->db = ['dsn'=>"mysql:host=localhost;dbname=spider" , 'user'=>'root' , 'pass'=>'' , 'table'=>'heu8_thread' ];
    $url = "http://www.heu8.com/page/";
    $spider->setReplace( [
        'http://www.heu8.com/wp-content/themes/BYMT/images/images_error.jpg' => 'http://img.feehi.com/public/loadImgError.jpg',
    ] );// after an article is fetched, every key found in its body is replaced by the matching value
    $spider->thumbImgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';// host part of thumbnail URLs (images are hosted on Qiniu); empty by default
    $spider->imgUrl = 'http://7othwv.com1.z0.glb.clouddn.com/soft/';// host part of article image URLs (images are hosted on Qiniu); empty by default
    $pattern = [
        'list' => '/<h2 class="post-title">\s*<a href="(.*)"/U',
        'list_img' => '/<div class="post-thumbnail">.*<a href=".*" rel="bookmark" title=".*">.*<img src="(.*)"/isU',
        'title' => '/<h2 class=\"post-title\">(.*)<\/h2>/U',
        'content' => '/<div class="post-content">(.*)<\/div>/isU',
        'content_img' => '/<img.*src="(.*)"/isU',
    ];// in order: article links on a list page, thumbnail URLs on a list page, article title, article body, image URLs inside the body
    $spider->run( $url , $pattern , $start , $length );// start crawling
    echo "抓取完成，下载图片/日志文件，请在脚本同级目录查找\r\n";

spider.php

<?php
namespace fee;
set_time_limit(0);// remove the execution time limit
date_default_timezone_set('PRC');
// NOTE(review): the article refers to this file as "pthread.php"; on a case-sensitive
// filesystem the name used here must match the file on disk exactly — confirm.
include "Pthread.php";
/**
 * Crawls numbered list pages, downloads their thumbnails and starts one
 * \fee\Pthread worker per article link found on each page.
 *
 * All properties are private but readable and writable from outside through
 * __get()/__set() (e.g. $spider->db = [...]).
 */
class Spider{
   // Database settings (array with 'dsn', 'user', 'pass', 'table'); false when not configured,
   // in which case results are meant to be written to a text file instead.
   private $db=false;
   // Log file name; defaults to the time the object was created.
   private $logTxt;
   // Whether log lines are also echoed to the console; off by default.
   private $echoLog;
   // Parallel arrays filled by setReplace(): strings to search for / their replacements.
   private $replaceFind = [];
   private $replaceArray = [];
   // Prefix used when rewriting image URLs inside article bodies.
   private $imgUrl = '';
   // Prefix prepended to the stored thumbnail paths.
   private $thumbImgUrl = '';

   /**
    * @param bool   $echoLog whether log() also echoes each line
    * @param string $logTxt  log file name; '' selects "<Y-m-d H-i-s>.txt"
    */
   public function __construct( $echoLog = false , $logTxt='' ){
      $this->echoLog = $echoLog;
      if( $logTxt == '' )
          $this->logTxt = date('Y-m-d H-i-s').'.txt';
      else
          $this->logTxt = $logTxt;
   }

   // Generic setter so callers can assign the private properties directly.
   public function __set( $k , $v ){
      $this->$k = $v;
   }

   // Generic getter so callers (including worker threads) can read the private properties.
   public function __get( $k ){
      return $this->$k;
   }

   /**
    * Registers body replacements.
    *
    * @param array $replace map of search string => replacement string
    */
   public function setReplace( $replace ){
      foreach( $replace as $k => $v ){
         $this->replaceFind[] = $k;
         $this->replaceArray[] = $v;
      }
   }

   /**
    * Crawls list pages $start, $start-1, ... while the page number is greater than $length.
    *
    * @param string $url     list page URL prefix; the page number is appended to it
    * @param array  $pattern regexes keyed 'list', 'list_img', 'title', 'content', 'content_img'
    * @param int    $start   first (highest) page number
    * @param int    $length  exclusive lower bound of the page range
    */
   public function run( $url , $pattern=[] , $start=2 , $length=0 ){
      $this->log( "抓取开始" );
      $startTime = microtime( true );
      $articleCount = 0;
      // Workers of every page are kept here so all of them are waited for at the end,
      // not only those of the last page.
      $threads = [];
      for($i=$start;$i>$length;$i--){
         $this->log( "正在分析第{$i}页" );
         $listUrl = $url.$i;
         $content = @file_get_contents( $listUrl );
         if( $content === false ){
            $this->log( "第{$i}页下载失败，已跳过" );
            continue;
         }
         preg_match_all($pattern['list'],$content, $matches);
         $this->log( "第{$i}页找到".count($matches[1])."篇文章" );
         $articleCount += count($matches[1]);
         preg_match_all($pattern['list_img'],$content,$matchesThumb);
         // Thumbnails and article links are both reversed so index k of one
         // corresponds to index k of the other.
         $thumbPic = $this->downloadThumbs( $i , array_reverse( $matchesThumb[1] ) );
         foreach( array_reverse( $matches[1] ) as $k => $v ){
            $thread = new \fee\Pthread( $v/**url**/ , $i/**page**/ , $k/**article index**/ , $pattern , $this , $thumbPic );
            $thread->start();
            $threads[] = $thread;
         }
      }
      // Block until every worker has finished; join() also returns when a worker
      // terminated abnormally, unlike polling a completion flag.
      foreach( $threads as $thread ){
         $thread->join();
      }
      $endTime = microtime( true );
      $intvalTime = $endTime - $startTime;
      $totalPage = $start-$length;
      $this->log( "共分析{$totalPage}页，抓取{$articleCount}篇文章，耗时{$intvalTime}s" );
   }

   /**
    * Downloads the thumbnails of one list page into thumb/<Y-m-d>/ next to this file.
    *
    * @param int   $page      page number, used for log messages only
    * @param array $thumbUrls thumbnail URLs in processing order
    * @return array stored thumbnail URLs, one entry per input URL (same order), so the
    *               result stays index-aligned with the article list even if a download fails
    */
   private function downloadThumbs( $page , $thumbUrls ){
      $thumbPic = [];
      foreach( $thumbUrls as $ThumbK => $ThumbV ){
         $this->log( "正在下载第{$page}页的第".($ThumbK+1)."张缩略图" );
         $dataThumb = @file_get_contents($ThumbV);
         $infoThumb = pathinfo($ThumbV);
         $filePath = 'thumb/'.date('Y-m-d').'/';
         $pathThumb = dirname(__FILE__).'/'.$filePath;
         if( !is_dir($pathThumb) ){
            mkdir( $pathThumb , 0777 , true );
         }
         // Random prefix keeps files with the same basename from overwriting each other.
         $fileName = rand(0,10000).'_'.urlencode($infoThumb['basename']);
         $thumbPic[] = $this->thumbImgUrl.$filePath.$fileName;
         // file_put_contents opens, writes and closes in one call (no leaked handle).
         if( $dataThumb === false || @file_put_contents( $pathThumb.$fileName , $dataThumb ) === false ){
            $this->log( "第{$page}页的第".($ThumbK+1)."张缩略图下载失败" );
         }
      }
      return $thumbPic;
   }

   /**
    * Appends a timestamped line to the log file and, when enabled, echoes it.
    *
    * @param string $log message (UTF-8)
    */
   public function log( $log ){
      // LOCK_EX: worker threads call log() concurrently on the same file.
      file_put_contents( $this->logTxt , date('Y-m-d H:i:s')." $log\r\n" , FILE_APPEND | LOCK_EX );
      if( $this->echoLog ){
         // The console copy is converted to GB2312; fall back to the original text
         // when the message contains characters that cannot be converted.
         $converted = @iconv('utf-8','GB2312',$log);
         if( $converted === false ) $converted = $log;
         echo date('Y-m-d H:i:s')." $converted\r\n";
      }
   }

}
?>

pthread.php

<?php
namespace fee;
/**
 * Worker thread that fetches one article, downloads its images and stores the
 * result in the database configured on the spider (or in spider.txt when no
 * database is configured).
 */
class Pthread extends \Thread{
   public $url;
   public $page;
   // Zero-based article index on construction; run() turns it into a one-based number.
   public $which;
   private $db;
   // Set to 1 when run() has finished, whatever the outcome.
   public $enddd = 0;
   public $pattern;
   public $spider;
   public $thumbPic;

   /**
    * @param string $url      article URL
    * @param int    $page     list page number the article was found on (for logging)
    * @param int    $which    zero-based index of the article on that page
    * @param array  $pattern  regexes; 'title', 'content' and 'content_img' are used here
    * @param object $spider   the spider; its db, imgUrl, replaceFind, replaceArray and log() are used
    * @param array  $thumbPic stored thumbnail URLs of the page, indexed like $which
    */
   public function __construct( $url , $page , $which , $pattern , $spider , $thumbPic){
      $this->url = $url;
      $this->page = $page;
      $this->which = $which;
      $this->pattern = $pattern; 
      $this->spider = $spider;
      $this->thumbPic = $thumbPic;
   }

   /**
    * Thread entry point. Always sets $enddd to 1 before returning so a caller
    * polling that flag cannot wait forever.
    */
   public function run( ){
      try{
         $this->crawl();
      }catch( \Exception $e ){
         // Fully qualified: an unqualified "Exception" would resolve to fee\Exception
         // and never match PDOException.
         echo $e->getMessage();
      }
      $this->enddd = 1;
   }

   // Fetches, rewrites and stores the article; returns early when there is nothing to store.
   private function crawl( ){
      $spider = $this->spider;
      $pattern = $this->pattern;
      $cfg = $spider->db;
      $table = '';
      $db = false;
      // Without a database configuration the result goes to spider.txt instead.
      if( is_array( $cfg ) ){
         $db = new \PDO( $cfg['dsn'] , $cfg['user'] , $cfg['pass'] );
         $db->query( "set names utf8" );
         $table = $cfg['table'];
         // NOTE(review): this UPDATE has no WHERE clause, so it touches every row — confirm intent.
         $db->query( "update {$table} set checked_times=checked_times+1" );
         $stmt = $db->prepare( "select * from {$table} where url=?" );
         if( $stmt !== false && $stmt->execute( [ $this->url ] ) ){
            $row = $stmt->fetch(\PDO::FETCH_ASSOC);
            if( is_array( $row ) ){
               $spider->log( "{$this->url}在".date('Y-m-d H:i:s',$row['created_at']).'已经抓取过了，本次未抓取。' );
               return;
            }
         }
      }
      // Keep the zero-based index for the thumbnail lookup before making
      // $this->which one-based for the log messages.
      $index = $this->which;
      $this->which = $index+1;
      $spider->log( "正在分析{$this->page}页第".($this->which)."篇文章…" );
      $content2 = @file_get_contents( $this->url );
      if( $content2 === false ){
         $spider->log( "{$this->url}下载失败" );
         return;
      }
      preg_match($pattern['title'],$content2,$matches2);
      $title = isset( $matches2[1] ) ? $matches2[1] : '';
      $spider->log( "抓取 {$this->which} 篇文章 {$this->url}成功。标题:{$title}…" );
      preg_match($pattern['content'],$content2,$matches2);
      $article = isset( $matches2[1] ) ? $matches2[1] : '';
      preg_match_all($pattern['content_img'],$article,$pics);
      $picCount = count($pics[1]);
      $imgUrl = $spider->imgUrl;
      foreach($pics[1] as $picIndex => $v2){
         $spider->log( '本页包含'.$picCount."张图，正在下载第".($picIndex+1)."张…"    );
         $data = @file_get_contents($v2);
         if( $data === false ){
            // Leave the original URL in the article when the image could not be fetched.
            $spider->log( "{$v2}下载失败" );
            continue;
         }
         $info = pathinfo($v2);
         $filePath = 'uploads/'.date('Y-m-d').'/';
         $path = dirname(__FILE__).'/'.$filePath;
         // Several threads may create the directory at once; a failed mkdir is fine
         // as long as the directory exists afterwards.
         if( !is_dir($path) && !@mkdir($path,0777,true) && !is_dir($path) ){
            $spider->log( "{$path}创建失败" );
            continue;
         }
         // Random prefix keeps files with the same basename from overwriting each other.
         $fileName = rand(0,10000).'_'.urlencode($info['basename']);
         if( @file_put_contents( $path.$fileName , $data ) === false ){
            $spider->log( "{$v2}保存失败" );
            continue;
         }
         // The public URL omits the leading "uploads/" directory.
         $filePahNoUpload = str_replace( 'uploads/' , '' , $filePath );
         $article = str_replace( $v2 , $imgUrl.$filePahNoUpload.$fileName , $article );
      }
      // Read into locals first: isset() on the spider's private properties would go
      // through __isset(), which the spider does not define, and always yield false.
      $find = $spider->replaceFind;
      $replacement = $spider->replaceArray;
      if( is_array( $find ) && count( $find ) > 0 ) $article = str_replace( $find , $replacement , $article );
      $time = time();
      $thumbs = $this->thumbPic;
      $thumb = isset( $thumbs[$index] ) ? $thumbs[$index] : '';
      if( $db !== false ){
         // Bound parameters instead of string-built SQL: titles and bodies come from a remote site.
         $insert = $db->prepare( "insert into {$table}(title,content,thumb,created_at,url) values(?,?,?,?,?)" );
         if( $insert !== false && $insert->execute( [ $title , $article , $thumb , $time , $this->url ] ) ){
            $spider->log( "{$this->url}入库成功…" );
         }else{
            $spider->log( "{$this->url}入库失败…" );
         }
      }else{
         file_put_contents( 'spider.txt' , $title.' '.$article.' '.$thumb.' '.$time.' '.$this->url."\r\n" , FILE_APPEND | LOCK_EX );
      }
   }
}
?>

当然这个多线程爬虫还没有很完善，分析列表页后，会创建该列表页文章数目的线程爬取，而在爬列表页缩略图的时候依然是单线程。不过，如果分析列表页就用多线程，站长估计会误认为是攻击，而封我ip了，还有一个没有解决的问题是，多线程爬文章的顺序打乱了，先返回结果的线程先入库。

转载请注明：飞嗨 » php多线程爬虫（pthread.so多线程扩展）爬某某站点

一	二	三	四	五	六	日
« 三
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31