java爬虫-webmagic的初步使用
webmagic是一个国人写的java爬虫框架,简单灵活。以一个漫画网站为例,实现一个爬取漫画的爬虫。
表结构:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
| -- Schema for the comic crawler (MySQL dialect). Three tables: comics, chapters, page images.
use spider;

-- One row per comic, keyed by comic_id.
create table bidongcomic_info (
    comic_id        varchar(20)   not null primary key,
    comic_title     varchar(100)  default null comment '标题',
    author          varchar(20)   comment '作者',
    dsc             varchar(1000) comment '作品描述',
    cover_url       varchar(100)  comment '封面链接',
    cover_url_local varchar(100)  comment '本地链接'
);

-- One row per chapter. The primary key is chap_id: keying this table on comic_id
-- (as the earlier version did) would allow only a single chapter row per comic.
-- comic_id stays NOT NULL (it was implicitly so as the former key) and is indexed
-- for "all chapters of a comic" lookups. No foreign key is declared, so chapter
-- rows can be inserted before the owning comic row exists.
create table bidongcomic_chapter (
    chap_id    varchar(20)  not null primary key,
    chap_title varchar(20),
    comic_id   varchar(100) not null,
    key bidongcomic_chapter_comic_id_idx (comic_id)
);

-- One row per page image; img_id is a surrogate key, chap_id links back to the chapter.
create table bidongcomic_img_url (
    chap_id varchar(20),
    img_id  int auto_increment primary key,
    img_url varchar(100),
    key bidongcomic_img_url_chap_id_idx (chap_id)
);
|
表已经建起来了,使用mybatis-generator生成mapper与model
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
| <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE generatorConfiguration PUBLIC "-//mybatis.org//DTD MyBatis Generator Configuration 1.0//EN" "http://mybatis.org/dtd/mybatis-generator-config_1_0.dtd">
<!--
  MyBatis Generator configuration for the `spider` database.
  Generates model classes, XML mapper files and mapper interfaces for the three
  bidongcomic_* tables.
-->
<generatorConfiguration>
    <!-- JDBC driver jar loaded by the generator; this is a machine-specific absolute path. -->
    <classPathEntry location="/home/jacob/.m2/repository/mysql/mysql-connector-java/8.0.18/mysql-connector-java-8.0.18.jar" />

    <context id="wangyongzhi_mysql_tables" targetRuntime="MyBatis3">
        <!-- Keep generated sources free of generator comments and timestamps. -->
        <commentGenerator>
            <property name="suppressAllComments" value="true" />
            <property name="suppressDate" value="true" />
        </commentGenerator>

        <!--
          com.mysql.cj.jdbc.Driver is the driver class of Connector/J 8.x (the jar above);
          the old com.mysql.jdbc.Driver name is deprecated there.
          The '&' separating URL parameters must be written as '&amp;' inside an XML attribute,
          otherwise the file is not well-formed.
          NOTE(review): credentials are hard-coded here; keep this file out of shared repositories.
        -->
        <jdbcConnection driverClass="com.mysql.cj.jdbc.Driver"
                        connectionURL="jdbc:mysql://127.0.0.1:3306/spider?useUnicode=true&amp;characterEncoding=UTF-8"
                        userId="root"
                        password="123456">
            <!-- With Connector/J 8.x this restricts table lookup to the database named in the URL. -->
            <property name="nullCatalogMeansCurrent" value="true" />
        </jdbcConnection>

        <javaTypeResolver>
            <property name="forceBigDecimals" value="false" />
        </javaTypeResolver>

        <!-- Model classes. -->
        <javaModelGenerator targetPackage="top.liberty3306.spiders.modules.bidongcomic.model" targetProject="src/main/java">
            <property name="enableSubPackages" value="true" />
            <property name="trimStrings" value="true" />
        </javaModelGenerator>

        <!-- XML mapper files. -->
        <sqlMapGenerator targetPackage="mapping" targetProject="src/main/resources">
            <property name="enableSubPackages" value="true" />
        </sqlMapGenerator>

        <!-- Mapper interfaces. -->
        <javaClientGenerator type="XMLMAPPER" targetPackage="top.liberty3306.spiders.modules.bidongcomic.dao" targetProject="src/main/java">
            <property name="enableSubPackages" value="true" />
        </javaClientGenerator>

        <!-- All "by example" methods are disabled for every table. -->
        <table schema="spider" tableName="bidongcomic_info" domainObjectName="BidongComicInfo"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
        <!-- tableName must match the table created by the DDL: bidongcomic_img_url. -->
        <table schema="spider" tableName="bidongcomic_img_url" domainObjectName="BidongComicImg"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
        <table schema="spider" tableName="bidongcomic_chapter" domainObjectName="BidongComicChapter"
               enableCountByExample="false" enableUpdateByExample="false"
               enableDeleteByExample="false" enableSelectByExample="false"
               selectByExampleQueryId="false">
        </table>
    </context>
</generatorConfiguration>
|
mybatis-generator生成了dao、mapper与model,现在在model上加入注解,匹配字段
BidongComicChapter.java(省略getter与setter)
1 2 3 4 5 6 7 8 9 10
| /**
 * Chapter model populated by WebMagic's annotation-driven extraction.
 *
 * NOTE(review): per WebMagic's annotation model, pages matching the TargetUrl pattern
 * are the ones extracted from, pages matching HelpUrl are presumably only followed to
 * discover links, and a class-level ExtractBy with multi = true should yield one
 * instance per matched node. Confirm against the WebMagic documentation.
 */
@HelpUrl("https://www.bidongmh.com/booklist*")
@TargetUrl("https://www.bidongmh.com/book/*")
// Selects the <li> entries under the ul whose class is chapter-list.
@ExtractBy(value = "//ul[@class=chapter-list]/li",multi = true)
public class BidongComicChapter {
    // Chapter id, read from the data-id attribute of the <li>; marked notNull.
    @ExtractBy(value = "//li/@data-id",notNull = true)
    private String chapId;
    // Chapter title, read from the text of the <a> inside the <li>.
    @ExtractBy("//li/a/text()")
    private String chapTitle;
    // Comic id, taken from the capture group matched against the page URL
    // (everything after /book/); marked notNull.
    @ExtractByUrl(value = "https://www.bidongmh.com/book/(.*/?)",notNull = true)
    private String comicId;
    // Getters and setters are omitted in this listing.
|
自定义pipeLine
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| /**
 * Page-model pipeline that hands each extracted model object to the matching
 * save method of {@link BidongComicService}.
 *
 * The interface is parameterised with {@code Object} (instead of being used as a
 * raw type) because a single pipeline instance receives several model classes.
 */
public class BidongComicInfoPipeLine implements PageModelPipeline<Object> {

    /** Service that performs the actual saving; set once in the constructor. */
    private final BidongComicService service;

    /**
     * @param service service used to save the extracted models
     */
    public BidongComicInfoPipeLine(BidongComicService service) {
        this.service = service;
    }

    /**
     * Dispatches the extracted object by its runtime type. Objects that are none
     * of the three known model types are ignored.
     *
     * @param o    the extracted model object
     * @param task the crawl task that produced it (not used here)
     */
    @Override
    public void process(Object o, Task task) {
        // The checks are deliberately independent of each other, as in the original.
        if (o instanceof BidongComicInfo) {
            service.saveComic((BidongComicInfo) o);
        }
        if (o instanceof BidongComicImg) {
            service.savImg((BidongComicImg) o);
        }
        if (o instanceof BidongComicChapter) {
            service.saveChapter((BidongComicChapter) o);
        }
    }
}
|
启动测试类
1 2 3 4 5 6 7 8
| /**
 * Starts the crawl from the book-list page with 6 threads. Extracted
 * BidongComicInfo and BidongComicChapter objects are passed to a
 * ConsolePageModelPipeline.
 */
@Test
void start() {
    // Site settings: sleep time of 1000 between requests.
    Site site = Site.me().setSleepTime(1000);

    OOSpider spider = OOSpider.create(
            site,
            new ConsolePageModelPipeline(),
            BidongComicInfo.class,
            BidongComicChapter.class);

    spider.addUrl("https://www.bidongmh.com/booklist")
            .thread(6)
            .run();
}
|