依赖 httpclient 4.2、Jsoup
SemeiziCrawler.java
package kidbei.learn.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for http://sejie.wanxun.org/ (example post:
 * http://sejie.wanxun.org/post/2012-09-25/40039413449).
 *
 * <p>Walks listing pages {@code BASEHOST + "page/" + i} from {@code STARTPAGE}
 * to {@code PAGECOUNT}, extracts per-post links, then extracts and downloads
 * every image of each post into {@code IMGPATH}.
 *
 * @author Administrator
 */
public class SemeiziCrawler {

    private static final String BASEHOST = "http://sejie.wanxun.org/";
    // Shared pooled client; see ConnectionManager for pool/timeout settings.
    private static DefaultHttpClient client = ConnectionManager.getHttpClient();
    static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449";
    // One sub-folder per day the crawler runs.
    private static String IMGPATH =
            "D:\\sexpicture\\色戒美眉图" + File.separator + StringUtil.getDate();
    static int STARTPAGE = 1;
    static int PAGECOUNT = 100;

    public static void main(String[] args) {
        File f = new File(IMGPATH);
        if (!f.exists()) {
            f.mkdirs();
        }
        // NOTE(review): the loop header and page-URL construction were garbled
        // in the published source ("for(int i=STARTPAGE;iarticleURLS = ...").
        // Reconstructed as iterating listing pages BASEHOST/page/i — confirm
        // against the original article.
        for (int i = STARTPAGE; i <= PAGECOUNT; i++) {
            String pageUrl = BASEHOST + "page/" + i;
            String pageContext = getResultByUrl(pageUrl);
            List<String> articleURLS = getArticleURL(pageContext);
            if (articleURLS == null) {
                continue; // listing page failed to load or parse
            }
            for (String articleURL : articleURLS) {
                String articleContext = getResultByUrl(articleURL);
                List<String> imgURLS = getImgURLS(articleContext);
                if (imgURLS == null) {
                    continue; // post failed to load
                }
                for (String imgURL : imgURLS) {
                    savepic(imgURL);
                }
            }
        }
    }

    /**
     * Fetches a URL and returns the response body decoded as UTF-8.
     *
     * @param url the absolute URL to fetch
     * @return the page text, or {@code null} on any error or empty entity
     */
    public static String getResultByUrl(String url) {
        System.out.println("打开网页" + url);
        HttpGet get = new HttpGet(url);
        HttpEntity entity = null;
        HttpResponse response = null;
        try {
            response = client.execute(get);
            entity = response.getEntity();
            if (entity != null) {
                InputStream is = entity.getContent();
                StringWriter sw = new StringWriter();
                IOUtils.copy(is, sw, "UTF-8");
                is.close();
                sw.close();
                return sw.toString();
            }
        } catch (Exception e) {
            System.out.println("网页打开出错");
            return null;
        } finally {
            // Abort + consume so the pooled connection is always released.
            get.abort();
            try {
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Extracts the URL of every post ("全文" link) on a listing page.
     *
     * @param pageContext the listing-page HTML, may be {@code null}
     * @return post URLs, or {@code null} if the input was null or parsing failed
     */
    public static List<String> getArticleURL(String pageContext) {
        if (pageContext == null) {
            return null;
        }
        List<String> articleURLS = new ArrayList<String>();
        System.out.println("寻找帖子...........");
        try {
            Document doc = Jsoup.parseBodyFragment(pageContext);
            // Narrow down to photo posts, then to their "read full post" anchor.
            Elements es = doc.select("div.post");
            es = es.select("div[class=post-item type-photo]");
            es = es.select("div.meta a:containsOwn(全文)");
            for (Element e : es) {
                articleURLS.add(e.attr("href"));
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
        return articleURLS;
    }

    /**
     * Extracts all image URLs from a post page.
     *
     * @param articleContext the post HTML, may be {@code null}
     * @return image URLs (possibly empty), or {@code null} if the input was null
     */
    public static List<String> getImgURLS(String articleContext) {
        if (articleContext == null) {
            return null;
        }
        List<String> imgURLS = new ArrayList<String>();
        System.out.println("获取图片地址-----------");
        Document doc = Jsoup.parse(articleContext);
        Elements es = doc.select("a[target=_blank] img[src]");
        for (Iterator<Element> i = es.iterator(); i.hasNext();) {
            Element e = i.next();
            imgURLS.add(e.attr("src"));
        }
        return imgURLS;
    }

    /**
     * Downloads one image to {@code IMGPATH}, naming the file after the last
     * path segment of the URL. Errors are logged and swallowed so one bad
     * image does not stop the crawl.
     *
     * @param ImgURL absolute image URL, may be {@code null} (no-op)
     */
    public static void savepic(String ImgURL) {
        if (ImgURL == null) {
            return;
        }
        HttpGet get = new HttpGet(ImgURL);
        String[] strs = ImgURL.split("/");
        String fileName = strs[strs.length - 1];
        String savePath = IMGPATH + File.separator + fileName;
        HttpEntity entity = null;
        try {
            HttpResponse response = client.execute(get);
            entity = response.getEntity();
            System.out.println("保存图片>>>>.>>>>>>" + fileName);
            InputStream is = entity.getContent();
            OutputStream os = new FileOutputStream(savePath);
            IOUtils.copy(is, os);
            IOUtils.closeQuietly(os);
            IOUtils.closeQuietly(is);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("图片保存失败");
        } finally {
            // Fix: original never released the connection here, starving the
            // pool (max 200 per route) after enough downloads.
            get.abort();
            try {
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
StringUtil.java
package kidbei.learn.crawler;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

/**
 * Small string helpers used by the crawler.
 *
 * <p>NOTE(review): the published source for this class was truncated
 * mid-way through {@code getRandomString} (everything after the first
 * {@code <} was eaten by HTML extraction), so the loop body and
 * {@code getDate()} below are reconstructions — confirm against the
 * original article.
 */
public class StringUtil {

    /**
     * Returns a random string of six lower-case ASCII letters.
     */
    public static String getRandomString() {
        StringBuffer generateRandStr = new StringBuffer();
        Random rand = new Random();
        int length = 6;
        char ch;
        for (int i = 0; i < length; i++) {
            // Reconstructed: pick one of the 26 lower-case letters per slot.
            ch = (char) ('a' + rand.nextInt(26));
            generateRandStr.append(ch);
        }
        return generateRandStr.toString();
    }

    /**
     * Returns today's date as {@code yyyy-MM-dd}; SemeiziCrawler uses it as
     * the per-day download folder name.
     */
    public static String getDate() {
        return new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }
}
ConnectionManager.java
package kidbei.learn.crawler; import org.apache.http.conn.scheme.PlainSocketFactory;import org.apache.http.conn.scheme.Scheme;import org.apache.http.conn.scheme.SchemeRegistry;import org.apache.http.conn.ssl.SSLSocketFactory;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.impl.conn.PoolingClientConnectionManager;import org.apache.http.params.BasicHttpParams;import org.apache.http.params.CoreConnectionPNames;import org.apache.http.params.CoreProtocolPNames;import org.apache.http.params.HttpParams; public class ConnectionManager { static final int TIMEOUT = 20000;//连接超时时间 static final int SO_TIMEOUT = 20000;//数据传输超时 static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" + " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"; public static DefaultHttpClient getHttpClient(){ SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register( new Scheme("http",80,PlainSocketFactory.getSocketFactory())); schemeRegistry.register( new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(500); cm.setDefaultMaxPerRoute(200); HttpParams params = new BasicHttpParams(); params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT); params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT); params.setParameter(CoreProtocolPNames.USER_AGENT, UA); DefaultHttpClient client = new DefaultHttpClient(cm,params); return client; }}
本文转自:http://www.oschina.net/code/snippet_257479_14524#23843