阅读 110

SpringBoot+WebMagic+MyBatis实现爬虫和数据入库

WebMagic是一个开源爬虫框架,本项目通过在SpringBoot项目中使用WebMagic去抓取数据,最后使用MyBatis将数据入库。

本项目代码地址:ArticleCrawler: SrpingBoot+WebMagic+MyBaties实现爬虫和数据入库 (gitee.com)

创建数据库:

本示例中库名为article,表名为cms_content,表中包含contentId、title、date三个字段。

-- CMS content table: one row per crawled article, keyed by contentId.
CREATE TABLE `cms_content` (
  `contentId` varchar(40)  NOT NULL COMMENT '内容ID',
  `title`     varchar(150) NOT NULL COMMENT '标题',
  `date`      varchar(150) NOT NULL COMMENT '发布日期',
  PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';

新建SpringBoot项目:

1、配置依赖pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Article crawler: Spring Boot web app + MyBatis/Druid/MySQL + WebMagic. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <relativePath/>
    </parent>
    <groupId>com.example</groupId>
    <artifactId>Article</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Article</name>
    <description>Article</description>

    <!-- Versions of everything not managed by spring-boot-starter-parent. -->
    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.5</webmagic.core.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <!-- Exclude WebMagic's log4j binding; NOTE(review): presumably to avoid a clash with Spring Boot's default SLF4J binding. -->
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!-- HTTPS is required: Maven 3.8.1+ blocks external repositories served over plain HTTP. -->
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>

2、创建CmsContentPO.java

数据实体,和表中3个字段对应。

package site.exciter.article.model; public class CmsContentPO {     private String contentId;     private String title;     private String date;     public String getContentId() {         return contentId;     }     public void setContentId(String contentId) {         this.contentId = contentId;     }     public String getTitle() {         return title;     }     public void setTitle(String title) {         this.title = title;     }     public String getDate() {         return date;     }     public void setDate(String date) {         this.date = date;     } } 复制代码

3、创建CrawlerMapper.java

package site.exciter.article.dao; import org.apache.ibatis.annotations.Mapper; import site.exciter.article.model.CmsContentPO; @Mapper public interface CrawlerMapper {     int addCmsContent(CmsContentPO record); } 复制代码

4、配置映射文件CrawlerMapper.xml

在resources下新建mapper文件夹,在mapper下创建CrawlerMapper.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements bound to the CrawlerMapper interface. -->
<mapper namespace="site.exciter.article.dao.CrawlerMapper">

    <!-- Inserts one CmsContentPO as a row of cms_content; all three columns are bound as VARCHAR. -->
    <insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO">
        insert into cms_content (contentId, title, date)
        values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{date,jdbcType=VARCHAR})
    </insert>

</mapper>

5、配置application.properties

配置数据库和mybatis映射关系。

# MySQL data source (Druid pool type, `article` database)
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
spring.datasource.password=root

# Druid connection pool sizing, validation and eviction
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000

# MyBatis: location of the mapper XML on the classpath
mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml

6、创建ArticlePageProcessor.java

解析html的逻辑。

package site.exciter.article; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; @Component public class ArticlePageProcessor implements PageProcessor {     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);     @Override     public void process(Page page) {         String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href";         String next_page_xpath = "//*[@id='nav_next_page']/a/@href";         String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)";         String title_xpath = "//h1[@class='postTitle']/a/span/text()";         String date_xpath = "//span[@id='post-date']/text()";         page.putField("title", page.getHtml().xpath(title_xpath).toString());         if (page.getResultItems().get("title") == null) {             page.setSkip(true);         }         page.putField("date", page.getHtml().xpath(date_xpath).toString());         if (page.getHtml().xpath(detail_urls_Xpath).match()) {             Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath);             page.addTargetRequests(detailUrls.all());         }         if (page.getHtml().xpath(next_page_xpath).match()) {             Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath);             page.addTargetRequests(nextPageUrl.all());         } else if (page.getHtml().css(next_page_css).match()) {             Selectable nextPageUrl = page.getHtml().css(next_page_css).links();             page.addTargetRequests(nextPageUrl.all());         }     }     @Override     public Site getSite() {         return site;     } } 复制代码

7、创建ArticlePipeline.java

处理数据的持久化。

package site.exciter.article;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import site.exciter.article.model.CmsContentPO;
import site.exciter.article.dao.CrawlerMapper;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.UUID;

/**
 * WebMagic pipeline that persists each extracted article through
 * {@link CrawlerMapper}.
 */
@Component
public class ArticlePipeline implements Pipeline {

    private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class);

    @Autowired
    private CrawlerMapper crawlerMapper;

    /**
     * Builds a {@link CmsContentPO} from the {@code title} and {@code date}
     * result fields, assigns it a random UUID as content ID and inserts it.
     *
     * <p>Insert failures are logged and not rethrown, so one bad record does
     * not abort the crawl.
     *
     * @param resultItems the fields extracted by the page processor
     * @param task        the task that produced the result (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String title = resultItems.get("title");
        String date = resultItems.get("date");

        CmsContentPO contentPO = new CmsContentPO();
        contentPO.setContentId(UUID.randomUUID().toString());
        contentPO.setTitle(title);
        contentPO.setDate(date);

        try {
            int affectedRows = crawlerMapper.addCmsContent(contentPO);
            // Report success only when the insert actually wrote a row.
            if (affectedRows > 0) {
                LOGGER.info("保存成功:{}", title);
            } else {
                LOGGER.warn("保存失败,未写入任何记录:{}", title);
            }
        } catch (Exception ex) {
            // Include the title so the failing record can be identified in the log.
            LOGGER.error("保存失败:{}", title, ex);
        }
    }
}

8、创建ArticleTask.java

执行抓取任务。

package site.exciter.article;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Schedules the article crawl: starts a new WebMagic {@link Spider} run
 * immediately and then again 10 minutes after each scheduling callback
 * returns.
 */
@Component
public class ArticleTask {

    // Use this class as the logger name so log lines are attributed to the task, not the pipeline.
    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleTask.class);

    @Autowired
    private ArticlePipeline articlePipeline;

    @Autowired
    private ArticlePageProcessor articlePageProcessor;

    /** Single-threaded scheduler that triggers each crawl run. */
    private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /**
     * Registers the recurring crawl job on the scheduler.
     *
     * <p>Each run creates a spider seeded with the list page URL, stores
     * results through {@link ArticlePipeline}, and starts it asynchronously
     * with 5 threads. Exceptions thrown while starting a run are logged so
     * that later runs are still scheduled.
     */
    public void crawl() {
        // Scheduled job: crawl once every 10 minutes.
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("ArticleCrawlerThread");
            try {
                Spider.create(articlePageProcessor)
                        .addUrl("http://www.cnblogs.com/dick159/default.html?page=2")
                        // Store the scraped data in the database.
                        .addPipeline(articlePipeline)
                        // Crawl with 5 threads.
                        .thread(5)
                        // Start the spider asynchronously.
                        .start();
            } catch (Exception ex) {
                LOGGER.error("定时抓取数据线程执行异常", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}

9、修改Application

package site.exciter.article; import org.mybatis.spring.annotation.MapperScan; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.CommandLineRunner; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; @SpringBootApplication @MapperScan(basePackages = "site.exciter.article.interface") public class ArticleApplication implements CommandLineRunner {     @Autowired     private ArticleTask articleTask;     public static void main(String[] args) {         SpringApplication.run(ArticleApplication.class, args);     }     @Override     public void run(String... args) throws Exception {         articleTask.crawl();     } } 复制代码

10、执行application,开始抓数据并入库


作者:非空子集
链接:https://juejin.cn/post/7018897037219332104


文章分类
后端
版权声明:本站是系统测试站点,无实际运营。本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 XXXXXXo@163.com 举报,一经查实,本站将立刻删除。
相关推荐