黄色网址大全免费-黄色网址你懂得-黄色网址你懂的-黄色网址有那些-免费超爽视频-免费大片黄国产在线观看

專注Java教育14年 全國咨詢/投訴熱線:400-8080-105
動力節點LOGO圖
始于2009,口口相傳的Java黃埔軍校
首頁 hot資訊 封裝Java爬蟲工具類

封裝Java爬蟲工具類

更新時間:2021-09-22 11:12:00 來源:動力節點 瀏覽1034次

封裝了一個JAVA爬蟲工具類。

1.maven引用jar

   <!-- HtmlUnit: provides the WebClient/HtmlPage types used by getHtmlPageResponse to render pages with JavaScript. -->
   <dependency>
		<groupId>net.sourceforge.htmlunit</groupId>
		<artifactId>htmlunit</artifactId>
		<version>2.27</version>
	</dependency>
	<!-- jsoup: provides Jsoup.parse/Document used by parseHtmlToDoc. -->
	<!-- NOTE(review): the org.apache.http classes used by the utility are presumably pulled in transitively by htmlunit; confirm. -->
	<dependency>
		<groupId>org.jsoup</groupId>
		<artifactId>jsoup</artifactId>
		<version>1.8.3</version>
	</dependency>    

2.工具類

  public class HttpHtmlUnit {
	/**
	 * Request timeout in milliseconds; defaults to 20000 ms.
	 * Used for both the socket and the connect timeout of every request.
	 */
	private int timeout = 20000;
	/**
	 * Time in milliseconds to wait for background (asynchronous) JavaScript
	 * to finish; defaults to 20000 ms.
	 */
	private int waitForBackgroundJavaScript = 20000;
	/**
	 * Cookie table (name to value). It is sent as the Cookie header on each
	 * request and updated from the cookie store after each response.
	 */
	private Map<String, String> cookieMap = new HashMap<>();
/**
 * Character set used to decode response bodies; defaults to UTF-8.
 */
private String charset = "UTF-8";
// Lazily created shared instance returned by getInstance().
private static HttpHtmlUnit httpUtils;
/**
 * Private constructor: instances are obtained through {@link #getInstance()}.
 */
private HttpHtmlUnit() {
	// Intentionally empty; all fields carry their defaults.
}
/**
 * Returns the shared {@code HttpHtmlUnit} instance, creating it on first use.
 *
 * <p>NOTE(review): the lazy initialisation is not synchronized, so concurrent
 * first calls may each create an instance; confirm whether callers are
 * single-threaded.
 *
 * @return the shared instance
 */
public static HttpHtmlUnit getInstance() {
	if (httpUtils != null) {
		return httpUtils;
	}
	httpUtils = new HttpHtmlUnit();
	return httpUtils;
}
/**
 * Removes every cookie stored in the cookie table, so subsequent requests
 * start without previously collected cookies.
 */
public void invalidCookieMap() {
	this.cookieMap.clear();
}
/**
 * Returns the request timeout.
 *
 * @return the timeout in milliseconds
 */
public int getTimeout() {
	return this.timeout;
}
/**
 * Sets the request timeout.
 *
 * @param timeout the timeout in milliseconds, applied to both the socket and
 *                the connect timeout of subsequent requests
 */
public void setTimeout(int timeout) {
	this.timeout = timeout;
}
/**
 * Returns the character set used to decode response bodies.
 *
 * @return the charset name
 */
public String getCharset() {
	return this.charset;
}
/**
 * Sets the character set used to decode response bodies.
 *
 * @param charset the charset name, e.g. {@code "UTF-8"}
 */
public void setCharset(String charset) {
	this.charset = charset;
}
/**
 * Returns how long page loading waits for background JavaScript.
 *
 * @return the wait time in milliseconds
 */
public int getWaitForBackgroundJavaScript() {
	return this.waitForBackgroundJavaScript;
}
/**
 * Sets how long to wait for asynchronous JavaScript to run when fetching a
 * fully rendered HTML page.
 *
 * @param waitForBackgroundJavaScript the wait time in milliseconds
 */
public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
	this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
}
/**
 * Parses an HTML string into a document, with {@code &nbsp;} entities
 * stripped (delegates to {@code removeHtmlSpace}).
 *
 * @param html the HTML text to parse
 * @return the parsed document
 * @throws Exception declared for callers; propagated from parsing if raised
 */
public static Document parseHtmlToDoc(String html) throws Exception {
	Document cleaned = removeHtmlSpace(html);
	return cleaned;
}
/**
 * Parses the given HTML, removes every {@code &nbsp;} entity from the
 * serialised markup, and parses the result again.
 *
 * @param str the HTML text
 * @return the document built from the markup without {@code &nbsp;}
 */
private static Document removeHtmlSpace(String str) {
	// First pass normalises the markup so the entity appears in a uniform form.
	String normalised = Jsoup.parse(str).html();
	String withoutNbsp = normalised.replace("&nbsp;", "");
	return Jsoup.parse(withoutNbsp);
}
/**
 * Executes a GET request and returns the response parsed as a document.
 *
 * @param url the URL to request
 * @return the parsed response document
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetAsDocument(String url) throws Exception {
	String html = executeGet(url);
	return parseHtmlToDoc(html);
}
/**
 * Executes a GET request and returns the response body as text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. The configured
 * timeout is used for both the socket and the connect timeout. The body is
 * decoded with the configured charset and is returned regardless of the HTTP
 * status code.
 *
 * @param url the URL to request
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executeGet(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client on every
	// path, including when cookie extraction or body decoding throws.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
	}
	return body;
}
/**
 * Executes a GET request over HTTPS and returns the response parsed as a
 * document.
 *
 * @param url the URL to request
 * @return the parsed response document
 * @throws Exception if the request or the parsing fails
 */
public Document executeGetWithSSLAsDocument(String url) throws Exception {
	String html = executeGetWithSSL(url);
	return parseHtmlToDoc(html);
}
/**
 * Executes a GET request over HTTPS using the client built by
 * {@code createSSLInsecureClient()} and returns the response body as text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. The body is decoded
 * with the configured charset and is returned regardless of the HTTP status
 * code.
 *
 * @param url the URL to request
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the SSL client cannot be created, the request fails,
 *                   or the body cannot be read
 */
public String executeGetWithSSL(String url) throws Exception {
	HttpGet httpGet = new HttpGet(url);
	httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client on every
	// path, including when cookie extraction or body decoding throws.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(httpGet, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
	}
	return body;
}
/**
 * Executes a form POST request and returns the response parsed as a document.
 *
 * @param url    the URL to request
 * @param params the form parameters (name to value)
 * @return the parsed response document
 * @throws Exception if the request or the parsing fails
 */
public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
	String html = executePost(url, params);
	return parseHtmlToDoc(html);
}
/**
 * Executes a URL-encoded form POST request and returns the response body as
 * text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. Form parameters are
 * encoded with the configured charset, so non-ASCII values are not mangled by
 * the library's ISO-8859-1 default. The response body is decoded with the
 * same charset.
 *
 * @param url    the URL to request
 * @param params the form parameters (name to value); {@code null} is treated
 *               as no parameters
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executePost(String url, Map<String, String> params) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	List<NameValuePair> formParams = new ArrayList<>();
	if (params != null) {
		for (Map.Entry<String, String> entry : params.entrySet()) {
			formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
		}
	}
	httpPost.setEntity(new UrlEncodedFormEntity(formParams, charset));
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client; previously
	// neither was closed, leaking the client's connection pool on every call.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// A response without a body yields "" instead of an exception.
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
	}
	return body;
}
/**
 * Executes a form POST request over HTTPS and returns the response parsed as
 * a document.
 *
 * @param url    the URL to request
 * @param params the form parameters (name to value)
 * @return the parsed response document
 * @throws Exception if the request or the parsing fails
 */
public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
	String html = executePostWithSSL(url, params);
	return parseHtmlToDoc(html);
}
/**
 * Executes a URL-encoded form POST request over HTTPS using the client built
 * by {@code createSSLInsecureClient()} and returns the response body as text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. Form parameters are
 * encoded with the configured charset, so non-ASCII values are not mangled by
 * the library's ISO-8859-1 default. The response body is decoded with the
 * same charset.
 *
 * @param url    the URL to request
 * @param params the form parameters (name to value); {@code null} is treated
 *               as no parameters
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the SSL client cannot be created, the request fails,
 *                   or the body cannot be read
 */
public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
	HttpPost post = new HttpPost(url);
	List<NameValuePair> formParams = new ArrayList<>();
	if (params != null) {
		for (Map.Entry<String, String> entry : params.entrySet()) {
			formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
		}
	}
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new UrlEncodedFormEntity(formParams, charset));
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client; previously
	// neither was closed, leaking the client's connection pool on every call.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(post, context)) {
		HttpEntity entity = response.getEntity();
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
	}
	return body;
}
/**
 * Sends a POST request whose body is the given JSON text and returns the
 * response body as text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. The response body is
 * decoded with the configured charset.
 *
 * @param url      the URL to request
 * @param jsonBody the JSON request body
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the request fails or the body cannot be read
 */
public String executePostWithJson(String url, String jsonBody) throws Exception {
	HttpPost httpPost = new HttpPost(url);
	httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
	httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client; previously
	// neither was closed, leaking the client's connection pool on every call.
	try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
			CloseableHttpResponse response = httpClient.execute(httpPost, context)) {
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
		HttpEntity entity = response.getEntity();
		// A response without a body yields "" instead of an exception.
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
	}
	return body;
}
/**
 * Sends a POST request over HTTPS whose body is the given JSON text, using
 * the client built by {@code createSSLInsecureClient()}, and returns the
 * response body as text.
 *
 * <p>The cookie table is sent as the {@code Cookie} header, and cookies
 * collected during the exchange are merged back into it. The response body is
 * decoded with the configured charset.
 *
 * @param url      the URL to request
 * @param jsonBody the JSON request body
 * @return the response body, or an empty string if the response has no entity
 * @throws Exception if the SSL client cannot be created, the request fails,
 *                   or the body cannot be read
 */
public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
	HttpPost post = new HttpPost(url);
	post.setHeader("Cookie", convertCookieMapToString(cookieMap));
	post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
	post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
	HttpClientContext context = HttpClientContext.create();
	String body = "";
	// try-with-resources closes the response and then the client; previously
	// neither was closed, leaking the client's connection pool on every call.
	try (CloseableHttpClient httpClient = createSSLInsecureClient();
			CloseableHttpResponse response = httpClient.execute(post, context)) {
		HttpEntity entity = response.getEntity();
		if (entity != null) {
			body = EntityUtils.toString(entity, charset);
		}
		getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
	}
	return body;
}
/**
 * Loads a page with HtmlUnit, waits for background JavaScript, and returns
 * the resulting page serialised as XML text.
 *
 * <p>The configured timeout is used for both the page request and JavaScript
 * execution; the configured background-JavaScript wait bounds how long this
 * method blocks after the page has loaded.
 *
 * @param url the page URL
 * @return the rendered page as an XML string
 * @throws Exception if the page cannot be loaded
 */
public String getHtmlPageResponse(String url) throws Exception {
	// try-with-resources closes the client on every path, including when
	// waiting for JavaScript or serialising the page throws.
	try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
		webClient.getOptions().setThrowExceptionOnScriptError(false); // do not throw when a script fails
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not throw on non-200 status
		webClient.getOptions().setActiveXNative(true);
		webClient.getOptions().setCssEnabled(true); // enable CSS
		webClient.getOptions().setJavaScriptEnabled(true); // important: enable JavaScript
		webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: AJAX support
		webClient.getOptions().setTimeout(timeout); // request timeout of the simulated browser
		webClient.setJavaScriptTimeout(timeout); // JavaScript execution timeout
		HtmlPage page = webClient.getPage(url);
		webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript); // blocks the calling thread
		return page.asXml();
	}
}
/**
 * Loads a page with HtmlUnit (waiting for background JavaScript) and returns
 * it parsed as a document.
 *
 * @param url the page URL
 * @return the parsed page document
 * @throws Exception if the page cannot be loaded or parsed
 */
public Document getHtmlPageResponseAsDocument(String url) throws Exception {
	String renderedHtml = getHtmlPageResponse(url);
	return parseHtmlToDoc(renderedHtml);
}
/**
 * Copies every cookie of the given store into the given map, keyed by cookie
 * name; an existing entry with the same name is overwritten.
 *
 * @param cookieStore the store to read cookies from
 * @param cookieMap   the map that receives name/value pairs
 */
private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
	cookieStore.getCookies().forEach(cookie -> cookieMap.put(cookie.getName(), cookie.getValue()));
}
/**
 * Serialises a cookie map into the form {@code name1=value1; name2=value2}.
 *
 * @param map the cookies (name to value)
 * @return the joined cookie string, or an empty string for an empty map
 */
private String convertCookieMapToString(Map<String, String> map) {
	// StringBuilder avoids re-copying the accumulated string on every entry.
	StringBuilder cookie = new StringBuilder();
	for (Map.Entry<String, String> entry : map.entrySet()) {
		// Writing the separator before each entry except the first removes
		// the need to trim a trailing "; " afterwards.
		if (cookie.length() > 0) {
			cookie.append("; ");
		}
		cookie.append(entry.getKey()).append("=").append(entry.getValue());
	}
	return cookie.toString();
}
/**
 * Creates an HTTPS-capable client that performs no certificate or hostname
 * validation.
 *
 * <p>WARNING: the trust strategy accepts every certificate chain and the
 * hostname verifier accepts every host, so connections made with this client
 * are not protected against man-in-the-middle attacks.
 *
 * @return a new client configured with the permissive SSL socket factory
 * @throws GeneralSecurityException if the SSL context cannot be built
 */
private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
	// Accept any certificate chain.
	SSLContext trustAllContext = new SSLContextBuilder()
			.loadTrustMaterial(null, (chain, authType) -> true)
			.build();
	// Accept any hostname.
	SSLConnectionSocketFactory socketFactory =
			new SSLConnectionSocketFactory(trustAllContext, (hostname, session) -> true);
	return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
}

3.遇到的問題:

htmlunit引用的commons-io版本較低。如果項目中其他地方引用了較高版本的commons-io,版本衝突會導致問題。處理版本衝突可參照Maven的依賴引用原則:pom文件中放置在較前位置的版本會先被引用。

Java開發(fā)工具有很多,大家以后可以慢慢了解,有些工具是比較常用的,大家可要掌握哦。

提交申請後,顧問老師會電話與您溝通安排學習

免費課程推薦 >>
技術(shù)文檔推薦 >>
主站蜘蛛池模板: 黄a大片| 欧美一级性视频 | 黄色毛片大全 | 欧美性xxx极品hd高清 | 一本高清 | 怡红院成人永久免费看 | 毛片毛多| 日韩欧美视频二区 | 国产1区二区| 精品国产高清自在线一区二区三区 | 99精品在线 | 午夜视频免费看 | 久久亚洲网站 | 国产日本欧美亚洲精品视 | 在线观看成人小视频 | 欧美成a高清在线观看www | 国产精品免费观在线 | 天天爽夜夜爽人人爽曰喷水 | 黄大色黄美女精品大毛片 | 日韩专区在线 | 免费成年人 | 一级特黄aaaaaa大片 | 日韩黄色免费观看 | 免费一级黄色录像 | 人成午夜性刺激免费 | 国产成人午夜精品影院游乐网 | 久草新在线 | 国产中文99视频在线观看 | 一级做a爱片特黄在线观看 一级做a爱片就在线看 | 亚洲精品第一页 | 一本久道久久综合中文字幕 | 国产 日韩 欧美 综合 | 看黄的视频 | 国内精品伊人久久久影视 | 男人和女人日皮视频 | 嗯啊使劲用力在线观看视频 | 网址你懂的在线观看 | 国产v亚洲v天堂a无 国产v亚洲v天堂无码 | 国产三级成人 | 无遮挡一级毛片 | 国产精品日韩专区 |