package com.chao.crawler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.chao.util.ListUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class PageProcesserProduct implements PageProcessor {
// private Site site = null;
//
// public PageProcesser(String domain, String startUrl) {
//
// site = Site.me().setDomain(domain).addStartUrl(startUrl);
//
// }
// For debugging: crawl target is hard-coded here.
// The domain is the bare host name (no scheme) and matches the host of the start URL;
// the previous value carried an "http://" prefix and pointed at a different host (".com" vs ".com.au").
private Site site = Site.me().setDomain("www.babysittersnow.com.au")
        .addStartUrl("http://www.babysittersnow.com.au/babysitters/search");
/**
 * Extracts the profile fields from a downloaded page and publishes them as result fields.
 *
 * <p>The fields {@code Title}, {@code Info}, {@code Introduction}, {@code Details} and
 * {@code Insights} are stored with their HTML tags stripped; {@code Review} is stored as the
 * raw selection result. The same values are also bundled into a {@link Product} stored under
 * the key {@code product}.
 *
 * <p>A field whose XPath matches nothing is stored as {@code null} instead of aborting the
 * whole page with a {@link NullPointerException}.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    String title = extractText(page, "//div[@class='profile-panel-main']/h1");
    page.putField("Title", title);

    String info = extractText(page, "//div[@class='profile-panel-details']");
    page.putField("Info", info);

    // Review is intentionally kept as the raw selection (markup included), as before.
    String review = page.getHtml().xpath("//div[@class='review']/p").toString();
    page.putField("Review", review);

    String introduction = extractText(page, "//div[@id='profile-tab-introduction']");
    page.putField("Introduction", introduction);

    String details = extractText(page, "//div[@id='profile-tab-details']");
    page.putField("Details", details);

    String insights = extractText(page, "//div[@id='profile-tab-insights']");
    page.putField("Insights", insights);

    System.out.println("商品筛选完毕,准备执行存储");

    // Bundle the individual values into one object for pipelines that persist a Product.
    Product product = new Product();
    product.setTitle(title);
    product.setInfo(info);
    product.setReview(review);
    product.setIntroduction(introduction);
    product.setDetails(details);
    product.setInsights(insights);
    page.putField("product", product);

    System.out.println("----------------------------------------------------");
}

/**
 * Selects the first match of {@code xpath} on the page and removes every HTML tag from it.
 *
 * @param page  the page whose HTML is queried
 * @param xpath the XPath expression to evaluate
 * @return the tag-free text, or {@code null} when the selection yields no value
 */
private static String extractText(Page page, String xpath) {
    // WebMagic's Selectable.toString() yields null when the XPath matches nothing,
    // so guard before calling replaceAll.
    String html = page.getHtml().xpath(xpath).toString();
    if (html == null) {
        return null;
    }
    return html.replaceAll("<[^>]*>", "");
}
/**
 * Supplies the crawl configuration held by this processor.
 *
 * @return the {@link Site} stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Entry point: builds a spider around this processor, attaches a console pipeline,
 * configures 10 threads and runs it.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    PageProcessor processor = new PageProcesserProduct();

    // Each fluent call's return value is carried forward, mirroring the original chain.
    Spider crawler = Spider.create(processor);
    crawler = crawler.pipeline(new ConsolePipeline());
    crawler = crawler.thread(10);

    crawler.run();
}
到此这篇Java的爬虫(Java爬虫步骤)的文章就介绍到这了,更多相关内容请继续浏览下面的相关推荐文章,希望大家都能在编程的领域有一番成就!
版权声明:
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。
如若内容造成侵权、违法违规、事实不符,请将相关资料发送至xkadmin@xkablog.com进行投诉反馈,一经查实,立即处理!
转载请注明出处,原文链接:https://www.xkablog.com/jjc/21830.html