avatar

Catalog
java爬虫-webmagic的初步使用

java爬虫-webmagic的初步使用

webmagic是一个国人写的java爬虫框架,简单灵活。以一个漫画网站为例,实现一个爬取漫画的爬虫。

表结构:

sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
-- Work inside the crawler's schema.
USE spider;

-- One row per comic: id, title, author, description and cover links
-- (the column COMMENT strings below carry the same meaning in Chinese).
CREATE TABLE bidongcomic_info
(
    comic_id        VARCHAR(20)   NOT NULL,
    comic_title     VARCHAR(100)  DEFAULT NULL COMMENT '标题',
    author          VARCHAR(20)   COMMENT '作者',
    dsc             VARCHAR(1000) COMMENT '作品描述',
    cover_url       VARCHAR(100)  COMMENT '封面链接',
    cover_url_local VARCHAR(100)  COMMENT '本地链接',
    PRIMARY KEY (comic_id)
);

-- One row per chapter. A comic has many chapters, so the row is keyed by the
-- chapter id; keying by comic_id would allow only one chapter per comic.
-- NOTE(review): assumes chap_id is unique across all comics -- confirm against
-- the site's data; otherwise use a composite key (comic_id, chap_id).
create table bidongcomic_chapter
(
    chap_id varchar(20) not null primary key,
    chap_title varchar(20),
    -- Owning comic; still NOT NULL, as it implicitly was while it was the key.
    comic_id varchar(100) not null
);

-- One row per image URL, linked to a chapter through chap_id;
-- img_id is an auto-increment surrogate key.
create table bidongcomic_img_url(
    chap_id varchar(20),
    img_id int auto_increment primary key,
    img_url varchar(100)
);

表已经建起来了,使用mybatis-generator生成mapper与model

xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE generatorConfiguration
        PUBLIC "-//mybatis.org//DTD MyBatis Generator Configuration 1.0//EN"
        "http://mybatis.org/dtd/mybatis-generator-config_1_0.dtd">

<generatorConfiguration>
    <!-- Path to the MySQL driver jar. This is an absolute, machine-specific path; adjust it locally. -->
    <classPathEntry location="/home/jacob/.m2/repository/mysql/mysql-connector-java/8.0.18/mysql-connector-java-8.0.18.jar" />

    <context id="wangyongzhi_mysql_tables" targetRuntime="MyBatis3">
        <!-- Suppress the comments and timestamps the generator would otherwise add to generated code. -->
        <commentGenerator>
            <property name="suppressAllComments" value="true" />
            <property name="suppressDate" value="true" />
        </commentGenerator>

        <!-- Database connection. Connector/J 8.x ships the driver as com.mysql.cj.jdbc.Driver;
             the old com.mysql.jdbc.Driver name is deprecated. -->
        <jdbcConnection driverClass="com.mysql.cj.jdbc.Driver"
                        connectionURL="jdbc:mysql://127.0.0.1:3306/spider?useUnicode=true&amp;characterEncoding=UTF-8"
                        userId="root"
                        password="123456">
            <!-- With Connector/J 8.x, limit metadata lookups to the database named in the URL so that
                 same-named tables in other databases are not picked up. -->
            <property name="nullCatalogMeansCurrent" value="true" />
        </jdbcConnection>

        <javaTypeResolver>
            <property name="forceBigDecimals" value="false" />
        </javaTypeResolver>

        <!-- Model classes generated from the tables. -->
        <javaModelGenerator targetPackage="top.liberty3306.spiders.modules.bidongcomic.model" targetProject="src/main/java">
            <property name="enableSubPackages" value="true" />
            <property name="trimStrings" value="true" />
        </javaModelGenerator>

        <!-- SQL mapper XML files. -->
        <sqlMapGenerator targetPackage="mapping" targetProject="src/main/resources">
            <property name="enableSubPackages" value="true" />
        </sqlMapGenerator>

        <!-- MyBatis 3 mapper interfaces. -->
        <javaClientGenerator type="XMLMAPPER" targetPackage="top.liberty3306.spiders.modules.bidongcomic.dao"
                             targetProject="src/main/java">
            <property name="enableSubPackages" value="true" />
        </javaClientGenerator>

        <!-- Tables to generate for. schema: database name; tableName: table name;
             domainObjectName: name of the generated model class. -->
        <table schema="spider" tableName="bidongcomic_info" domainObjectName="BidongComicInfo"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
        <!-- The image table is created as bidongcomic_img_url; the model keeps the shorter name BidongComicImg. -->
        <table schema="spider" tableName="bidongcomic_img_url" domainObjectName="BidongComicImg"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
        <table schema="spider" tableName="bidongcomic_chapter" domainObjectName="BidongComicChapter"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
    </context>
</generatorConfiguration>

mybatis-generator生成了dao、mapper与model,现在在model上加入注解,匹配字段

BidongComicChapter.java(省略getter与setter)

java
1
2
3
4
5
6
7
8
9
10
/**
 * Page model for a single chapter entry of a comic.
 *
 * <p>The class-level annotations name the list pages ({@code @HelpUrl}) and the
 * comic detail pages ({@code @TargetUrl}) by URL pattern. The class-level
 * {@code @ExtractBy} with {@code multi = true} selects every {@code li} under the
 * chapter list, so presumably one instance is produced per list item -- confirm
 * against the WebMagic annotation documentation.
 */
@HelpUrl("https://www.bidongmh.com/booklist*")
@TargetUrl("https://www.bidongmh.com/book/*")
@ExtractBy(value = "//ul[@class=chapter-list]/li",multi = true)
public class BidongComicChapter {
// Chapter id, read from the li element's data-id attribute; marked notNull.
@ExtractBy(value = "//li/@data-id",notNull = true)
private String chapId;
// Chapter title, read from the text of the link inside the li.
@ExtractBy("//li/a/text()")
private String chapTitle;
// Comic id, captured from the page URL by the regex group after "/book/"; marked notNull.
@ExtractByUrl(value = "https://www.bidongmh.com/book/(.*/?)",notNull = true)
private String comicId;
// Getters and setters are omitted in this excerpt.

自定义pipeLine

java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
 * Pipeline that hands each extracted page model to {@link BidongComicService}
 * according to its runtime type.
 *
 * <p>Typed as {@code PageModelPipeline<Object>} rather than the raw interface,
 * because one instance receives several different model classes.
 */
public class BidongComicInfoPipeLine implements PageModelPipeline<Object> {
    /** Service the extracted models are passed to; never null. */
    private final BidongComicService service;

    /**
     * Creates the pipeline.
     *
     * @param service service that receives the extracted models; must not be null
     * @throws NullPointerException if {@code service} is null
     */
    public BidongComicInfoPipeLine(BidongComicService service) {
        // Fail fast here instead of on the first extracted object.
        this.service = java.util.Objects.requireNonNull(service, "service");
    }

    /**
     * Passes the extracted object to the service method matching its type.
     * Objects of any other type are ignored.
     *
     * @param o    the extracted page model
     * @param task the task that produced it (unused)
     */
    @Override
    public void process(Object o, Task task) {
        if (o instanceof BidongComicInfo) {
            service.saveComic((BidongComicInfo) o);
        }
        if (o instanceof BidongComicImg) {
            // NOTE(review): "savImg" looks like a typo for "saveImg"; the method is
            // declared on BidongComicService, so the name is left unchanged here.
            service.savImg((BidongComicImg) o);
        }
        if (o instanceof BidongComicChapter) {
            service.saveChapter((BidongComicChapter) o);
        }
    }
}

启动测试类

java
1
2
3
4
5
6
7
8
/**
 * Starts the crawl from the book list page with 6 threads and a sleep time of
 * 1000 on the site configuration, passing the extracted models to a
 * ConsolePageModelPipeline.
 */
@Test
void start() {
    Site siteConfig = Site.me().setSleepTime(1000);
    OOSpider.create(
                    siteConfig,
                    new ConsolePageModelPipeline(),
                    BidongComicInfo.class,
                    BidongComicChapter.class)
            .addUrl("https://www.bidongmh.com/booklist")
            .thread(6)
            .run();
}
Author: Jacob
Link: http://yoursite.com/2020/01/25/java%E7%88%AC%E8%99%AB-webmagic%E4%B8%8ESpringboot%E7%9A%84%E4%BD%BF%E7%94%A8/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.
Donate
  • 微信
    微信
  • 支付寶
    支付寶