PHP蜘蛛爬虫详解

发表于：2020-08-31 18:40:01浏览：125次TAG： #PHP

PHP蜘蛛爬虫开发文档

官方文档

https://doc.phpspider.org/

GitHub地址

https://github.com/owner888/phpspider

phpspider-master/core 文件介绍

文件名	描述
init.php	公共入口文件
constants.php	公共常量定义文件
phpspider.php	核心类文件
-	-
configs详解	-
requests.php	请求类文件
selector.php	选择器类文件
db.php	数据库类文件
cache.php	缓存类文件
log.php	日志类文件
queue.php	Redis操作类文件
util.php	实用函数集合类文件
worker.php	多进程操作类

加载核心文件

require './vendor/autoload.php';
use phpspider\core\phpspider;

requests.php 请求类详解

成员	描述
input_encoding	输入编码明确指定输入的页面编码格式(UTF-8,GB2312,…..)，防止出现乱码,如果设置null则自动识别
output_encoding	输出编码明确指定输出的编码格式(UTF-8,GB2312,…..)，防止出现乱码,如果设置null则为utf-8
encoding	获取网页编码
content	获取响应内容 - 转码前内容
text	获取响应内容 - 转码后内容
status_code	网页状态码
headers	获取响应头
request	获取请求头

# 加载 核心文件
require './vendor/autoload.php';
use phpspider\core\phpspider;
# 加载 请求类文件
use phpspider\core\requests;

# 设置 输入编码
requests::$input_encoding = null;        // null=自动识别

# 设置 输出编码
requests::$output_encoding = null;        // null=utf-8

# 获取 网页编码 
requests::$encoding;

# 获取 响应内容 - 转码前内容 
requests::$content;

# 获取 响应内容 - 转码后内容 
requests::$text;

# 获取 网页状态码
requests::$status_code;

# 获取 响应头 
requests::$headers;

# 获取 请求头 
requests::$request;

方法	描述
set_timeout( $timeout )	设置请求超时时间
set_proxy( $proxy )	设置请求代理
set_useragent( $useragent )	浏览器useragent(UA)
set_referer( $referer )	浏览器请求来路URL
set_header( $key, $value )	添加请求的Header
set_cookie( $key, $value, $domain = '' )	添加请求的Cookie(单个)
get_cookie( $name, $domain = '' )	获取请求的Cookie(单个)
set_cookies( $cookies, $domain = '' )	设置请求Cookie(Cookie字符串)
get_cookies( $domain = '' )	获取指定域名下的全部Cookie
set_client_ip( $ip )	设置请求伪IP
set_hosts( $host, $ips )	设置请求的第三方主机和IP
get( $url, $params, $allow_redirects, $cert )	用来获取某个网页
post( $url, $params, $files, $allow_redirects, $cert )	用来发起POST请求(提交表单、上传文件)
put( $url, $params, $allow_redirects, $cert )	用来发起PUT请求
delete( $url, $params, $allow_redirects, $cert )	用来发起DELETE请求

# 设置 请求超时时间
# 1. 单一值 (同时设置connect和read)
requests::set_timeout(10);
# 2. 数组值 (设置connect和read二者的timeout) 
requests::set_timeout( array(3, 27) );

# 设置 请求代理
// 1. 字符串
requests::set_proxy('http://user:pass@host:port');
// 2. 数组
requests::set_proxy(
    array(
        'http://user:pass@host:port',
        'http://user:pass@host:port'
    )
);

# 设置 UA头
// 1. 字符串
requests::set_useragent("Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/");
// 2. 数组
requests::set_useragent(
    array(
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/"
    )
);
# 设置 请求来路URL
requests::set_referer('https://www.baidu.com');

# 设置 请求的Header
requests::set_header("Referer", "http://www.baidu.com");

# 添加 请求的Cookie
requests::set_cookie("BAIDUID", "FEE96299191CB0F11954F3A0060FB470:FG=1", "http://www.baidu.com");
requests::set_cookies("BAIDUID=FEE96299191CB0F11954F3A0060FB470:FG=1", "http://www.baidu.com");

# 获取 请求的Cookie
requests::get_cookie("BAIDUID", "http://www.baidu.com");
requests::get_cookies("http://www.baidu.com");

# 设置 设置请求伪IP
// 1. 单一值
requests::set_client_ip("192.168.0.2");
// 2. 数组
requests::set_client_ip(
    array(
        "192.168.0.1",
        "192.168.0.2"
    )
);

# 设置 请求的第三方主机和IP
requests::set_hosts(
    "http://www.baidu.com", 
    array(
        "203.195.143.21",
        "203.195.143.22"
    )
);

# 发起 get 请求
requests::get("https://github.com/timeline.json");

# 发起 post 请求
// 1. 登录
requests::post(
    "http://www.domain.com", 
    array(
        "username" => "test", "password" => "test"
    )
);
// 2. 文件上传
requests::post(
    "http://www.domain.com", 
    null, 
    array(
        "file1" => "test1.jpg", 
        "file2" => "test2.jpg"
    )
);

# 发起 put 请求
requests::put(
    "http://www.domain.com", 
    "{username:\"test888\",password:\"123456\"}"
);

# 发起 delete 请求
requests::delete(
    "http://www.domain.com", 
    "{username:\"test888\"}"
);

selector.php 选择器类详解

方法	描述
select( $html, $selector, $selector_type = 'xpath' )	选择匹配的内容
remove( $html, $selector, $selector_type = 'xpath' )	删除匹配的内容

/**
 * select( $html, $selector, $selector_type = 'xpath' )
 * @param $html 需要筛选的网页内容
 * @param $selector 选择器规则
 * @param $selector_type 选择器类型: xpath (默认) / regex / css
 */
# 1. xpath
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, '//*[@id="endText"]');         // 读取 网易新闻 新闻内容
var_dump($data);

# 2. css
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, ".post_content_main > h1", "css");        // 读取 网易新闻 详情页标题
var_dump($data);

# 3. regex
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, "@<title>(.*?)</title>@", "regex");        // 读取 网易新闻 title标题内容
var_dump($data);


/**
 * remove( $html, $selector, $selector_type = 'xpath' )
 * @param $html 需要筛选的网页内容
 * @param $selector 选择器规则
 * @param $selector_type 选择器类型: xpath (默认) / regex / css
 */
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$html = selector::select($html, '//*[@id="endText"]');         // 读取 网易新闻 新闻内容
// 在上面获取的内容基础上，删除第一个<p>标签(原标题)
$data = selector::remove($html, '//*[@id="endText"]/p[1]');
var_dump($data);

db.php 数据库类详解

# 数据配置链接
$db_config = array(
    'host' => '127.0.0.1',
    'port' => 3306,
    'user' => 'root',
    'pass' => '123456',
    'name' => 'demo_db'
);
// 数据库配置
db::set_connect('default', $db_config);
// 数据库连接
db::init_mysql();

方法	描述
query($sql)	原生SQL操作
get_one($sql)	单条查询
get_all($sql)	多条查询
insert($table, $data)	单条插入
insert_batch($table, $data)	批量插入
update_batch($table, $data, $index)	批量修改
delete($table, $where)	单条删除

# query 原生操作 
// 1. 查询
$query = db::query("select * from `content`");
while($row = db::fetch($query)) {
    echo "id = {$row['id']}; name = {$row['name']}; \n";
}

// 2. 新增 
db::query("insert into `content` (`name`) values ('test');");

// 3. 更新
db::query("update `content` set `name`='test' where `id`=1;");

// 4. 删除
db::query("delete from `content` where `id`=1;");

# get_one
$row = db::get_one("select * from `content` where `id`=1;");

# get_all
$rows = db::get_all("select * from `content` limit 5;");

# insert
$rows = db::insert('content', array('name' => 'test'));

# insert_batch
$rows = db::insert_batch(
    'content', 
    array(
        array(
            'name' => 'test1'
        ),
        array(
            'name' => 'test2'
        )
    )
);

# update_batch
db::update_batch(
    'content',
    array(
        array(
            'id' => 1,
            'name' => 'test1'
        ),
        array(
            'id' => 2,
            'name' => 'test2'
        )
    ),
    'id'     // 以 id 为条件进行修改
);

# delete
$rows = db::delete('content', "`id`=1");

使用 configs 来编写爬虫

# 加载 核心文件
require './vendor/autoload.php';
use phpspider\core\phpspider;

# 官方文档说不要删除这段注释，我并不知道有什么用，文档说加就加
/* Do NOT delete this comment */
/* 不要删除这段注释 */

$configs = array(
    'name' => '163新闻',         // 当前爬虫名称
    'log_show' => false,         // 是否显示日志, 默认false, 可选 true (显示调试信息) | false (显示爬取面板, tail -f data/phpspider.log 查看日志)
    'log_file' => 'data/phpspider.log',            // 日志文件路径, 默认 data/phpspider.log
    'log_type' => '',             // 显示和记录的日志类型, 默认空, 可选 info(普通) | warn(警告) | debug(调试) | error(错误 )
    'input_encoding' => null,     // 输入编码, 默认null(自动识别)
    'output_encoding' => null,     // 输出编码, 默认null(null=utf-8)
    'tasknum' => 1,             // 同时工作的爬虫任务数, 默认1(单进程任务爬取)
    'multiserver' => false,     // 多服务器处理, 默认false, 可选 true | false
    'serverid' => 1,             // 服务器ID, 默认1, 启用第二台服务器可设置为2
    'save_running_state' => false,     // 保存爬虫运行状态, 默认false(不保存), 可选 true | false
    'queue_config' => array(         // redis 配置, 保存爬虫运行状态、多任务处理 和 多服务器处理 都需要 redis 来保存采集任务数据
        'host' => '127.0.0.1',
        'port' => 6379,
        'pass' => '',
        'db' => 5,
        'prefix' => 'phpspider',
        'timeout' => 30
    ),
    'proxy' => array(                 // 代理服务器，如果爬取的网站根据ip做了反爬虫，可以设置此项
        'http://host:port',
        'http://user:pass@host:port',
    ),
    'interval' => 1000,             // 爬取单个网页的时间间隔, 单位毫秒
    'timeout' => 5,                 // 爬取每个网页的超时时间, 单位秒
    'max_try' => 0,                 // 爬取每个网页失败后尝试次数, 默认0(不重复爬取)
    'max_depth' => 0,                 // 爬取网页深度, 超过深度的页面不再采集, 默认0(不限制)
    'max_fields' => 0,                 // 爬取内容网页最大条数, 默认0(不限制)
    'user_agent' => "",             // 爬取网页所使用的浏览器类型
                                    // 1. 枚举类型
                                    // phpspider::AGENT_ANDROID, 表示爬虫爬取网页时, 使用安卓手机浏览器
                                    // phpspider::AGENT_IOS, 表示爬虫爬取网页时, 使用苹果手机浏览器
                                    // phpspider::AGENT_PC, 表示爬虫爬取网页时, 使用PC浏览器
                                    // phpspider::AGENT_MOBILE, 表示爬虫爬取网页时, 使用移动设备浏览器
                                    // 2. 自定义类型
                                    // 'user_agent' => "Mozilla/5.0"
                                    // 3. 随机浏览器类型
                                    // 'user_agent' => array(
                                    //         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
                                    //         "Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1",
                                    //         "Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S",
                                    // );
    'client_ip' => "",                 // 爬取网页所使用的伪IP，用于破解防采集
                                    // 1. 字符串类型
                                    // 'client_ip' => '192.168.0.2'
                                    // 2. 数组类型
                                    // 'client_ip' => array(
                                    //         '192.160.0.1',
                                    //         '192.160.0.2',
                                    // );
    'export' => array(                // 爬取数据数据导出
        'type' => 'csv',             // 导出类型 csv | sql | db
        'file' => './data/163_news.csv',         // 导出文件路径

        // 'type' => 'sql'
        // 'file' => './data/163_news.sql',
        // 'table' => 'news_table',                 // 导出db、sql数据库表名

        // 'type' => 'db'
        // 'table' => 'news_table',                 // 导出db、sql数据库表名
    ),
    'db_config' => array(             // 数据库配置
        'host'  => '127.0.0.1',
        'port'  => 3306,
        'user'  => 'root',
        'pass'  => 'root',
        'name'  => 'demo',
    ),
    'domains' => array(             // 定义爬虫爬取哪些域名下的网页, 非域名下的url会被忽略以提高爬取速度
        '163.com',
        'news.163.com'
    ),
    'scan_urls' => array(            // 定义爬虫的入口链接, 爬虫从这些链接开始爬取,同时这些链接也是监控爬虫所要监控的链接
        'https://news.163.com'
    ),
    'content_url_regexes' => array( // 定义内容页url的规则, 正则表达式 最好填写以提高爬取效率
        'https://news.163.com/\d+/\d+/\d+/\w+.html'
    ),
    'list_url_regexes' => array(     // 定义列表页url的规则, 对于有列表页的网站, 使用此配置可以大幅提高爬虫的爬取速率
        'https://news.163.com/gz/page/\d+.html'
    ),
    'fields' => array(                 // 定义内容页的抽取规则, 规则由一个个field组成, 一个field代表一个数据抽取项
        array(
            'name' => "content",         // 名称, 不能为空
            'selector' => '//*[@id="endText"]',     // 定义抽取规则, 不能为空, 默认使用xpath
            'selector_type' => 'xpath',             // 抽取规则类型, 默认xpath, 可选 xpath | jsonpath | regex
            'required' => true,            // 是否必须的, 默认false, 可选 true | false
            'repeated' => false,         // 抽取到的内容是否多项, 默认false, 可选 false | true(结果都是数组类型)
            'children' => array(        // 为此field定义子项, 子项的定义仍然是一个fields数组
                array(
                    'name' => 'replay',         // # 例如抽取新闻下面的评论
                    'selector' => "//div[contains(@class,'replay')]"
                )
            ),
            'source_type' => 'url_context',    // 该field的数据源, 默认从当前的网页 (url_context) 中抽取数据, 可选 url_context | attached_url
            // 'source_type' => 'attached_url',
            // 'attached_url' => 'https://news.163.com/{comment_id}/comments',             // 当source_type设置为attached_url时, 定义新请求的url
        ),
        array(
            'name' => "title",
            'selector' => '//*[@id="epContentLeft"]/h1',
        )
    )
);

// 载入配置
$spider = new phpspider($configs);
// 启动爬虫
$spider->start();

PHP蜘蛛爬虫详解

PHP蜘蛛爬虫开发文档

官方文档

GitHub地址

phpspider-master/core 文件介绍

加载核心文件

requests.php 请求类 详解

selector.php 选择器类 详解

db.php 数据库类 详解

使用 configs 来编写爬虫

requests.php 请求类详解

selector.php 选择器类详解

db.php 数据库类详解