IpProxyPool
No description available
Install / Use
/learn @xwlmdd/IpProxyPool
README
项目介绍 基于Spring Boot+MyBatis+Redis+WebMagic+百度OCR开发的系统架构。主要爬取的网站有全网代理、西刺代理、米扑代理、快代理。可用的IP代理主要分布在米扑和西刺,其他两个网站可用的比较少。其中米扑代理的IP端口是图片,这里使用了百度免费OCR来识别图片。(有兴趣的同学可以看一下Tess4j技术训练识别类库)
环境搭建 该项目采用maven统一构建,首先创建一个maven项目,在pom.xml文件中添加以下依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the ipproxypool application, packaged as a jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.mdd</groupId>
<artifactId>ipproxypool</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>ipProxyPool</name>
<description>Demo project for Spring Boot</description>
<!-- Inherits from the Spring Boot 1.5.6 starter parent. Dependencies below that omit a
     <version> presumably rely on the parent's dependency management; confirm before
     changing the parent version. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.6.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<!-- Java language level; code in this project must stay compatible with 1.7
     (no lambdas or streams). -->
<java.version>1.7</java.version>
</properties>
<dependencies>
<!-- Redis access via Spring Data -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<!-- MyBatis integration for Spring Boot -->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>1.3.0</version>
</dependency>
<!-- Thymeleaf templates and Spring MVC web stack -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- MySQL JDBC driver; the runtime scope is deliberately commented out, so the driver
     is on the compile classpath as well. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<!--<scope>runtime</scope>-->
</dependency>
<!-- Test-only dependencies -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.restdocs</groupId>
<artifactId>spring-restdocs-mockmvc</artifactId>
<scope>test</scope>
</dependency>
<!-- Disabled Log4j 2 starter. The artifactId was misspelled "log42j" in the original
     comment; the published starter is spring-boot-starter-log4j2. -->
<!--<dependency>-->
<!--<groupId>org.springframework.boot</groupId>-->
<!--<artifactId>spring-boot-starter-log4j2</artifactId>-->
<!--</dependency>-->
<!-- WebMagic crawler framework (core + extension), both pinned to 0.5.3 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.5.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.5.3</version>
</dependency>
<!-- Both commons-lang3 and the legacy commons-lang 2.x are declared. NOTE(review):
     consider standardising on one of them. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-lang/commons-lang -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Spring Boot Maven plugin -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<!-- MyBatis Generator, configured for verbose output and to overwrite existing
     generated files. -->
<plugin>
<groupId>org.mybatis.generator</groupId>
<artifactId>mybatis-generator-maven-plugin</artifactId>
<version>1.3.2</version>
<configuration>
<verbose>true</verbose>
<overwrite>true</overwrite>
</configuration>
</plugin>
</plugins>
</build>
</project>
编写Application程序启动类
package com.mdd.proxyip;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Entry point of the proxy-pool application.
 *
 * <p>Annotated as a Spring Boot application with scheduling support enabled.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    /**
     * Starts the Spring application using this class as its source.
     *
     * @param args command-line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(Application.class);
        application.run(args);
    }
}
关键代码如下 使用webmagic获取ip代理详情(全网代理ip分散在不同标签【找规律解析出来】|| 米扑代理ip端口是图片【采用百度OCR】)
/**
* 云代理网站ip抓取
* @author xwl 2017.6.3
*/
@Component
public class QuanWanProxyIpCrawler implements PageProcessor {
private Logger logger = Logger.getLogger(QuanWanProxyIpCrawler.class);
// 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private Site site = Site.me().setCycleRetryTimes(3).setRetryTimes(3).setSleepTime(1000)
.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0");
public Site getSite() {
return site;
}
public void process(Page page) {
Html html = page.getHtml();
List<String> urlList = page.getHtml().regex("http://www\\.goubanjia\\.com/index\\d*\\.shtml").all();
// 结果集
List<ProxyIp> proxyIpList = new ArrayList<ProxyIp>();
List<String> proxyIpTrList = html.xpath("//*[@id='list']/table/tbody/tr").all();
if (proxyIpTrList != null && proxyIpTrList.size() > 0) {
for (String trHtml : proxyIpTrList) {
ProxyIp proxyIp = new ProxyIp();
String ip = analyzeIp(trHtml);
// ip地址
proxyIp.setProxyIp(ip.substring(0, ip.indexOf(":")));
// 端口号
proxyIp.setProxyPort(Integer.parseInt(ip.substring(ip.indexOf(":") + 1, ip.length())));
analyzeProxyIp(trHtml, proxyIp);
proxyIpList.add(proxyIp);
}
page.putField("proxyIpList", proxyIpList);
}
page.addTargetRequests(urlList);
}
public static void main(String[] args) {
Spider.create(new QuanWanProxyIpCrawler()).addUrl("http://www.goubanjia.com/").thread(5).run();
}
/**
* 该网站ip每个数字分开,没有规律
*
* @param trHtml
* @return
*/
private String analyzeIp(String trHtml) {
trHtml = "<html><head></head><body><table>" + trHtml + "</table></body></html>";
StringBuffer ip = new StringBuffer();
if (StringUtils.isBlank(trHtml)) {
return null;
}
List<Node> nodeList = Jsoup.parse(trHtml).getElementsByTag("td").get(0).childNodes();
if (nodeList == null || nodeList.size() <= 0) {
return null;
}
for (int i = 0; i < nodeList.size() - 1; i++) {
Node node = nodeList.get(i);
String nodeHtml = node.outerHtml();
// display:none;不显示的
if (StringUtils.isNotBlank(nodeHtml) && nodeHtml.contains("display:none;")
|| nodeHtml.contains("display: none;")) {
continue;
}
String text = CommonUtils.simpleMatch(nodeHtml, ">\\s*(.*)\\s*</");
ip.append(text);
}
String resultIp = ip.toString().replace("null", "").replace(" ", "").trim();
String port = CommonUtils.simpleMatch(nodeList.get(nodeList.size() - 1).outerHtml(), ">\\s*(.*)\\s*</");
resultIp = resultIp + ":" + port;
return resultIp;
}
/**
* 解析除了ip、port之外的属性
*
* @param trHtml
* @param proxyIp
* @return
*/
private void analyzeProxyIp(String trHtml, ProxyIp proxyIp) {
trHtml = "<html><head></head><body><table>" + trHtml + "</table></body></html>";
if (StringUtils.isBlank(trHtml)) {
return;
}
try {
Elements tdElements = Jsoup.parse(trHtml).select("tr > td");
if (tdElements == null || tdElements.isEmpty()) {
return;
}
// 匿名度
String anonymity = tdElements.get(1).select("a").text();
// 类型
String proxyType = tdElements.get(2).select("a").text();
// 地址
StringBuffer ipAddress = new StringBuffer();
Elements aElements = tdElements.get(3).select("a");
for (Element a : aElements) {
ipAddress.append(a.text());
}
// 响应速度
String responseSpeed = tdElements.get(5).text();
proxyIp.setAnonymity(anonymity);
proxyIp.setProxyType(proxyType);
proxyIp.setIpAddress(ipAddress.toString());
proxyIp.setResponseSpeed(responseSpeed);
} catch (Exception e) {
logger.debug("解析失败",e);
}
}
}
米扑代理ip端口是图片【采用百度OCR】
package com.mdd.proxyip.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.HashMap;
/**
* Created by xwl on 2017/8/20.
*/
public class VCodeCheckUtils {
private static final Logger logger = Logger.getLogger(VCodeCheckUtils.class);
private static String OCRUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic";
// private static String OCRUrl = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage";
private static final String ACCESS_TOKEN = getAccessToken();
/**
* 获取AccessToken
* 百度开发
* AppId:10028388
* APIKey:kdZU5aOeI7FguVfWzql7LOGM
* SecretKey:Xxcze1I2RLUhB8NFd7T4u4fHdBGundrn
*
* @return
*/
public static String getAccessToken() {
String accessToken = "";
HttpRequestData httpRequestData = new HttpRequestData();
HashMap<String, String> params = new HashMap<>();
params.put("grant_type", "client_credentials");
params.put("client_id", "xxxxx");
params.put("client_secret", "xxxxx");
httpRequestData.setRequestMethod("GET");
httpRequestData.setParams(params);
httpRe
Related Skills
node-connect
352.0kDiagnose OpenClaw node connection and pairing failures for Android, iOS, and macOS companion apps
frontend-design
111.1kCreate distinctive, production-grade frontend interfaces with high design quality. Use this skill when the user asks to build web components, pages, or applications. Generates creative, polished code that avoids generic AI aesthetics.
openai-whisper-api
352.0kTranscribe audio via OpenAI Audio Transcriptions API (Whisper).
qqbot-media
352.0kQQBot 富媒体收发能力。使用 <qqmedia> 标签,系统根据文件扩展名自动识别类型(图片/语音/视频/文件)。
