java爬图片数据 demo

    技术2022-07-16  105

    package com.xcx.spots.test;

    import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.QueueScheduler;

    import us.codecraft.webmagic.selector.Selectable; public class Danli implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); @Override public void process(Page page) { //如果页面是列表页 if (page.getUrl().toString().equals("http://www.mmonly.cc/ktmh/dmmn/")) { //得到列表图片的所有链接 page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all()); // 获取下一页,倒数第个a标签 // 规定属于其父元素的第二个子元素的每个 p 元素,从最后一个子元素开始计数: //这样就拿到了下一页的link page.addTargetRequest(page.getHtml().$("#pageNum > a:nth-last-child(2)").links().toString()); //详情页 匹配括号内的数字 } else if (page.getUrl().regex("http://www.mmonly.cc/ktmh/dmmn/[\\d]+") != null) { // 得到详情页里面的下一页按钮的 href Selectable links = page.getHtml().$("#nl > a").links(); if (links != null ) page.addTargetRequest(links.toString()); // 抓取图片内容p标签 String img = page.getHtml().$("#big-pic p img").toString(); //细查a标签 if (img == "null") //img标签直接使用link()不能直接获取,所以只能获取到内容,进行内容的截取 img = page.getHtml().$("#big-pic a img").toString(); //截取从s开始计,+5就刚好到h的位置 img = img.substring(img.indexOf("src=\"") + 5, img.length() - 2); page.putField("img", img); } } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new Danli()) .addUrl("http://www.mmonly.cc/ktmh/dmmn/") .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000))) .addPipeline(new Demo()).thread(5).run(); }

    } package com.xcx.spots.test;

    /**

    @ProjectName: spots

    @Package: com.xcx.spots.test

    @ClassName: Demo

    @Author: nh

    @Description:

    @Date: 2020/7/2 13:48

    @Version: 1.0 */ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; public class Demo implements Pipeline { @Override public void process(ResultItems resultItems, Task task) {

    // Pages that published no "img" field have nothing to download; calling
    // toString() on the missing value would throw a NullPointerException.
    Object img = resultItems.get("img");
    if (img == null) {
        return;
    }
    UrlFileDownloadUtil.downloadPicture(img.toString());

    } } package com.xcx.spots.test;

    /**

    @ProjectName: spots
    @Package: com.xcx.spots.test
    @ClassName: UrlFileDownloadUtil
    @Author: nh
    @Description:
    @Date: 2020/7/2 13:48
    @Version: 1.0
    */
    import java.io.*;
    import java.net.URL;
    import java.util.UUID;

    public class UrlFileDownloadUtil {

        /** Directory used by {@link #downloadPicture(String)}. */
        private static final String DEFAULT_DIRECTORY = "D:\\image\\";

        /** Size of the copy buffer in bytes. */
        private static final int BUFFER_SIZE = 1024 * 100;

        /**
         * Downloads the resource at {@code url} into the default directory under a
         * random {@code <uuid>.jpg} name. Failures are printed, never thrown.
         *
         * @param url address of the picture to download
         */
        public static void downloadPicture(String url) {
            downloadPicture(url, DEFAULT_DIRECTORY);
        }

        /**
         * Downloads the resource at {@code url} into {@code directory} under a
         * random {@code <uuid>.jpg} name, creating the directory when it is missing.
         * Failures are printed, never thrown.
         *
         * @param url       address of the picture to download
         * @param directory directory the file is written to
         * @return the written file, or {@code null} when the download failed
         */
        public static File downloadPicture(String url, String directory) {
            File target = null;
            try {
                File dir = new File(directory);
                // mkdirs() returns false when the directory already exists (or another
                // thread just created it), so re-check before treating it as an error.
                if (!dir.mkdirs() && !dir.isDirectory()) {
                    throw new IOException("Cannot create directory: " + dir);
                }
                File destination = new File(dir, UUID.randomUUID().toString() + ".jpg");

                // The input is opened first, so an unreachable URL creates no file;
                // both streams are closed even if the copy fails.
                try (InputStream in = new URL(url).openStream();
                     OutputStream out = new FileOutputStream(destination)) {
                    target = destination;
                    byte[] buffer = new byte[BUFFER_SIZE];
                    int length;
                    while ((length = in.read(buffer)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }

                System.out.println("下载完成:" + destination.getPath());
                return destination;
            } catch (IOException | RuntimeException e) {
                // Best effort: report the failure and remove any partial file.
                e.printStackTrace();
                if (target != null) {
                    target.delete();
                }
                return null;
            }
        }

    }

    Processed: 0.018, SQL: 9