博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
httpclient爬取性感美图
阅读量:5277 次
发布时间:2019-06-14

本文共 6883 字,大约阅读时间需要 22 分钟。

依赖 httpclient 4.2 和 Jsoup

SemeiziCrawler.java

package kidbei.learn.crawler; import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.io.StringWriter;import java.util.ArrayList;import java.util.Iterator;import java.util.List; import org.apache.commons.io.IOUtils;import org.apache.http.HttpEntity;import org.apache.http.HttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.util.EntityUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/** * http://sejie.wanxun.org/post/2012-09-25/40039413449 * @author Administrator * */public class SemeiziCrawler {    private static final String BASEHOST = "http://sejie.wanxun.org/";    private static DefaultHttpClient client = ConnectionManager.getHttpClient();    static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449";    private static String IMGPATH = "D:\\sexpicture\\色戒美眉图"+File.separator+StringUtil.getDate();    static int STARTPAGE = 1;    static int PAGECOUNT = 100;     public static void main(String[] args) {        File f = new File(IMGPATH);        if(!f.exists()){            f.mkdirs();        }        String host = BASEHOST ;        for(int i=STARTPAGE;i
articleURLS = getArticleURL(pageContext); for(String articleURL:articleURLS){ String articleContext = getResultByUrl(articleURL); List
ImgURLS = getImgURLS(articleContext); for(String ImgURL:ImgURLS){ savepic(ImgURL); } } }// String articleContext = getResultByUrl(url);// List
strs = getImgURLS(articleContext);// for(String str:strs){// System.out.println(str);// } } /** * 根据url获取页面 * @param url * @return */ public static String getResultByUrl(String url){ System.out.println("打开网页"+url); HttpGet get = new HttpGet(url); HttpEntity entity = null; HttpResponse response = null; try { response = client.execute(get); entity = response.getEntity(); if(entity != null){ InputStream is = entity.getContent(); StringWriter sw = new StringWriter(); IOUtils.copy(is, sw, "UTF-8"); is.close(); sw.close(); return sw.toString(); } } catch (Exception e) { System.out.println("网页打开出错"); return null; }finally{ get.abort(); try { EntityUtils.consume(entity); } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 找出当前页面中所有帖子的地址 * @param pageStr 网页字符串 * @return */ public static List
getArticleURL(String pageContext){ if(pageContext == null){ return null; } List
articleURLS = new ArrayList
(); System.out.println("寻找帖子..........."); try { Document doc = Jsoup.parseBodyFragment(pageContext); Elements es = doc.select("div.post"); es = es.select("div[class=post-item type-photo]"); es = es.select("div.meta a:containsOwn(全文)"); for(Element e:es){ articleURLS.add(e.attr("href")); } } catch (Exception e) { e.printStackTrace(); return null; } return articleURLS; } /** * 获取帖子的图片地址 * @param articleURLS * @return */ public static List
getImgURLS(String articleContext){ List
ImgURLS = new ArrayList
(); if(articleContext == null){ return null; } System.out.println("获取图片地址-----------"); Document doc = Jsoup.parse(articleContext); Elements es = doc.select("a[target=_blank] img[src]"); for(Iterator
i=es.iterator();i.hasNext();){ Element e = i.next(); ImgURLS.add(e.attr("src")); } return ImgURLS; } /** * 保存图片 * @param ImgURL */ public static void savepic(String ImgURL){ if(ImgURL == null){ return ; } HttpGet get = new HttpGet(ImgURL); String[] strs = ImgURL.split("/"); String fileName = strs[strs.length-1]; String savePath = IMGPATH+File.separator+fileName; HttpEntity entity = null; try { HttpResponse response = client.execute(get); entity = response.getEntity(); System.out.println("保存图片>>>>.>>>>>>"+fileName); InputStream is = entity.getContent(); OutputStream os = new FileOutputStream(savePath); IOUtils.copy(is, os); IOUtils.closeQuietly(os); IOUtils.closeQuietly(is); } catch (Exception e) { e.printStackTrace(); System.out.println("图片保存失败"); return ; } }}

 StringUtil.java 

package kidbei.learn.crawler; import java.io.File;import java.text.SimpleDateFormat;import java.util.Date;import java.util.Random; public class StringUtil {    public static String getRandomString(){        StringBuffer generateRandStr = new StringBuffer();         Random rand = new Random();         int length = 6;         char ch;        for(int i=0;i

ConnectionManager.java

package kidbei.learn.crawler; import org.apache.http.conn.scheme.PlainSocketFactory;import org.apache.http.conn.scheme.Scheme;import org.apache.http.conn.scheme.SchemeRegistry;import org.apache.http.conn.ssl.SSLSocketFactory;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.impl.conn.PoolingClientConnectionManager;import org.apache.http.params.BasicHttpParams;import org.apache.http.params.CoreConnectionPNames;import org.apache.http.params.CoreProtocolPNames;import org.apache.http.params.HttpParams; public class ConnectionManager {    static final int TIMEOUT = 20000;//连接超时时间    static final int SO_TIMEOUT = 20000;//数据传输超时    static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" +            " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";         public static DefaultHttpClient getHttpClient(){        SchemeRegistry schemeRegistry = new SchemeRegistry();        schemeRegistry.register(                new Scheme("http",80,PlainSocketFactory.getSocketFactory()));        schemeRegistry.register(                new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));                 PoolingClientConnectionManager  cm = new PoolingClientConnectionManager(schemeRegistry);        cm.setMaxTotal(500);        cm.setDefaultMaxPerRoute(200);                 HttpParams params = new BasicHttpParams();        params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT);        params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT);        params.setParameter(CoreProtocolPNames.USER_AGENT, UA);                 DefaultHttpClient client = new DefaultHttpClient(cm,params);        return client;    }}

本文转自:http://www.oschina.net/code/snippet_257479_14524#23843

转载于:https://www.cnblogs.com/dreammyle/p/4149687.html

你可能感兴趣的文章
java DecimalFormat
查看>>
简单两步快速学会使用Mybatis-Generator自动生成entity实体、dao接口和简单mapper映射(用mysql和oracle举例)...
查看>>
Spring读书笔记-----Spring核心机制:依赖注入
查看>>
如何挂载阿里云的数据盘
查看>>
block extends include三者的差别跟用法
查看>>
服务器安全
查看>>
系统学习qsort1 尤其partition
查看>>
yield生成器对象返回Fiabs元素 分类: python 小练习 ...
查看>>
HDU 1001 Sum Problem
查看>>
BZOJ 1196 [HNOI2006]公路修建问题(二分答案+并查集)
查看>>
Android学习笔记1:初识框架
查看>>
bzoj 2005
查看>>
杜教筛模板
查看>>
浅谈委托事件
查看>>
装箱问题
查看>>
C++:重载全局new/delete实现跨平台多线程内存检测
查看>>
文法解释修改
查看>>
Spring中的@Controller和 @RestController 的区别以及@RequestMapping的作用
查看>>
ajax不跳转页面的快速删除操作,可添加美观样式
查看>>
使用HTML语言和CSS开发商业站点(7)
查看>>