4 Star 2 Fork 3

梦之狼 / A repository that stores the code of a WeChat mini program

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
MainSpider.java 7.56 KB
一键复制 编辑 原始数据 按行查看 历史
import java.net.URL;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
/*
import java.net.Proxy;
import java.net.InetSocketAddress;
import java.net.InetAddress;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
*/
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.File;
import java.io.FileWriter;
import javax.imageio.ImageIO;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.HashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class MainSpider {
PrintStream out = System.out;
/**
 * Entry point: scrapes one hard-coded article page and writes the result to a
 * hard-coded CSV file.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String url = "https://www.ixiupet.com/gougou/ggmr/22721.html";
    // try-with-resources releases the file handle even when the scrape fails.
    // wxmlForIxiu may also close the writer; closing a Writer twice has no effect.
    try (FileWriter fw = new FileWriter("E:\\temp\\essay.csv")) {
        MainSpider spider = new MainSpider();
        spider.wxmlForIxiu(url, fw);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Scrapes a single article page and writes it to {@code fw} as a header line
 * followed by one comma-joined record.
 *
 * <p>The page is fetched with {@code getInfo(url)} and decoded as GBK. For every
 * {@code div.article-content} element, inline styles are set on its images,
 * paragraphs and {@code strong} tags and the resulting HTML is printed to
 * {@code out}. The first non-empty image source and paragraph text found become
 * the record's cover and description.
 *
 * <p>Values are joined with plain commas and are not CSV-escaped, so a comma
 * inside the title or description shifts the columns of that record.
 *
 * <p>Failures are printed, not rethrown. {@code fw} is always closed before
 * this method returns.
 *
 * @param url address of the article page to scrape
 * @param fw  writer that receives the header and the record; closed by this method
 */
void wxmlForIxiu(String url,FileWriter fw) {
    List<String> header = Arrays.asList("pet","type","title","cover","description","time","origin","writer");
    List<String> result = new ArrayList<>();
    try {
        String html = InputStream2String(new InputStreamReader(getInfo(url),"gbk"));
        Document doc = Jsoup.parse(html);
        String cover = "";
        String description = "";
        Elements article = doc.getElementsByClass("inner").select("div.article-content");
        for (Element a : article) {
            // Compare by content: a reference comparison against "" is only true for
            // the very same String object, so the fallback to later elements would
            // otherwise never trigger for values built at runtime.
            if (cover.isEmpty()) {
                cover = a.select("img").attr("src");
            }
            if (description.isEmpty()) {
                description = a.select("p").text()
                        .replaceAll("<a href=(.*) target=\"_blank\">", "")
                        .replaceAll("<u>", "")
                        .replaceAll("</u>", "")
                        .replaceAll("</a>", "");
            }
            a.select("img").attr("style", "width:100%;");
            a.select("p").attr("style","margin:2px 8px 8px 8px;text-align:justify");
            a.select("strong").attr("style","margin-top:4px;margin-bottom:1px");
            out.println(a.html());
        }
        Elements info = doc.getElementsByClass("listltitle");
        // The writer span is expected to read "label:name"; when the colon or the
        // name is missing, fall back to an empty name instead of losing the record.
        String[] writerParts = info.select("span.spanimg1").text().split(":");
        String writerName = writerParts.length > 1 ? writerParts[1] : "";

        // The leading line break separates the record from the header line.
        result.add("\n"+"dog");
        result.add("晒宠");
        result.add(info.select("h1").text());
        result.add(cover);
        result.add(description.replace(" ",""));
        result.add(info.select("span.spanimg3").text());
        result.add("宠物网(ixiupet.com)");
        result.add(writerName);
        fw.write(String.join(",",header));
        fw.write(String.join(",",result));
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close on every path so a failed scrape does not leak the file handle.
        if (fw != null) {
            try {
                fw.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Downloads the images listed on the paginated list pages below {@code url}
 * and stores each one as a JPEG file.
 *
 * <p>List pages are addressed as {@code url + "list_9_" + i + ".html"} for
 * {@code i} from 1 up to, but not including, {@code pageNum}.
 * NOTE(review): confirm whether the last page is meant to be excluded.
 *
 * <p>Each file is named after the image's {@code alt} text with {@code /},
 * {@code |} and {@code \} replaced by {@code -}. The target is
 * {@code path + fileName + ".jpg"}, so {@code path} must already end with a
 * separator.
 *
 * <p>A download that cannot be decoded as an image is skipped. Any other
 * failure is printed and ends the run.
 *
 * @param url     base address of the list pages
 * @param path    directory prefix for the saved files
 * @param pageNum exclusive upper bound of the page index
 */
public void saveImages(String url,String path,int pageNum) {
    try {
        for (int i = 1; i < pageNum; i++) {
            String pageUrl = url+"list_9_"+i+".html";
            InputStreamReader response = new InputStreamReader(getInfo(pageUrl),"gbk");
            // println, not printf: a URL is not a format string and may contain '%'.
            System.out.println(pageUrl);
            for (Element petList : parserImg(response)) {
                String src = petList.attr("src");
                String fileName = petList.attr("alt")
                        .replace("/", "-")
                        .replace("|", "-")
                        .replace("\\", "-");
                System.out.println(src + " " + fileName);

                // ImageIO.read does not close the stream it is given, so close it here.
                java.awt.image.BufferedImage image;
                try (InputStream img = getInfo(src)) {
                    image = ImageIO.read(img);
                }
                if (image == null) {
                    // No registered reader understood the payload; skip this one
                    // image rather than aborting the whole crawl.
                    System.out.println("skipped, not a decodable image: " + src);
                    continue;
                }
                File file = new File(path+fileName+".jpg");
                if (!ImageIO.write(image,"jpg",file)) {
                    System.out.println("skipped, no jpg writer accepts this image: " + src);
                }
            }
            System.out.println("第"+i+"页保存完毕");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Reads {@code in} to the end and returns its content as one string.
 *
 * <p>The content is copied character for character, line terminators included,
 * so markup that spans several lines keeps its whitespace. The reader is closed
 * before returning, also when reading fails.
 *
 * <p>An {@link IOException} is printed, not rethrown; whatever was read up to
 * that point is returned.
 *
 * @param in reader to drain; closed by this method
 * @return the text read from {@code in}, possibly empty
 */
String InputStream2String(InputStreamReader in) {
    StringBuilder sb = new StringBuilder();
    char[] buffer = new char[8192];
    try (BufferedReader reader = new BufferedReader(in)) {
        int count;
        while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
            sb.append(buffer, 0, count);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sb.toString();
}
/**
 * Parses a list page and returns the image elements found in it.
 *
 * <p>Starting from the elements matched by class name {@code "news-main bg"},
 * the search narrows to {@code div.tiyan-bd-sml} and then to {@code img}
 * elements that carry a {@code src} attribute.
 * NOTE(review): the class name contains a space; confirm it matches as
 * intended with the jsoup version in use.
 *
 * @param in reader over the page's HTML; closed before returning
 * @return the matching image elements
 * @throws IOException if closing {@code in} fails
 */
Elements parserImg(InputStreamReader in) throws IOException{
    String html = InputStream2String(in);
    Document page = Jsoup.parse(html);
    Elements container = page.getElementsByClass("news-main bg");
    Elements thumbnails = container.select("div.tiyan-bd-sml");
    Elements images = thumbnails.select("img[src]");
    in.close();
    return images;
}
/**
 * Extracts the text fields of a detail page as a flat list of strings.
 *
 * <p>The list holds, in order:
 * <ol>
 *   <li>the text of every {@code a} element below an element of class
 *       {@code c1text3}, with each comma replaced by {@code |} so the value
 *       can sit in a comma-joined row;</li>
 *   <li>one entry per {@code div} selected from the elements of class
 *       {@code pingjialist}: the part of the {@code class} attribute that
 *       follows the first {@code "start"}, or {@code "0"} when there is no such
 *       part or it contains a comma. NOTE(review): presumably a rating value;
 *       confirm against the page markup.</li>
 * </ol>
 *
 * @param in reader over the page's HTML
 * @return the extracted values; never {@code null}
 */
ArrayList <String> parserText(InputStreamReader in) {
    ArrayList<String> info = new ArrayList<>();
    Document doc = Jsoup.parse(InputStream2String(in));
    for (Element c : doc.getElementsByClass("c1text3").select("a")) {
        info.add(c.text().replace(",", "|"));
    }
    for (Element r : doc.getElementsByClass("pingjialist").select("div")) {
        // Test the array length instead of catching ArrayIndexOutOfBoundsException.
        String[] parts = r.attr("class").split("start");
        if (parts.length > 1 && !parts[1].contains(",")) {
            info.add(parts[1]);
        } else {
            info.add("0");
        }
    }
    return info;
}
/**
 * Crawls the paginated list pages below {@code url} and appends one
 * comma-joined row per listed entry to {@code writer}.
 *
 * <p>List pages are addressed as {@code url + "list_9_" + p + ".html"} for
 * {@code p} from 1 up to, but not including, {@code pageNum}.
 * NOTE(review): confirm whether the last page is meant to be excluded.
 *
 * <p>For every element of class {@code tiyan-smll-det}, the first child's
 * {@code href} is fetched and parsed with {@code parserText}. Its
 * {@code title} (with {@code /}, {@code |} and {@code \} replaced by
 * {@code -}) is turned into a cloud storage path that becomes the last column.
 * Elements without children are skipped.
 *
 * <p>Each row ends with a line break and no trailing comma. The writer is
 * flushed after each page but not closed; closing it is left to the caller.
 * Failures are printed and end the crawl.
 *
 * @param url     base address of the list pages
 * @param writer  destination for the rows
 * @param pageNum exclusive upper bound of the page index
 */
void CsvForBreed(String url,FileWriter writer,int pageNum) {
    try {
        for (int p = 1; p < pageNum; p++) {
            Document doc = Jsoup.parse(InputStream2String(
                    new InputStreamReader(
                            getInfo(url+"list_9_"+p+".html"),
                            "gbk")));
            Elements petList = doc.getElementsByClass("tiyan-smll-det");
            for (Element pet : petList) {
                if (pet.children().isEmpty()) {
                    // Nothing to follow; child(0) would throw and abort the crawl.
                    continue;
                }
                Element link = pet.child(0);
                String detailUrl = link.attr("href");
                out.println(detailUrl);
                List<String> info = parserText(
                        new InputStreamReader(
                                getInfo(detailUrl),
                                "gbk"));
                String pic = link.attr("title")
                        .replace("/", "-")
                        .replace("|", "-")
                        .replace("\\", "-");
                info.add("cloud://pethub-database.7065-pethub-database-1301811252/cat/"+pic+".jpg");
                // Append the line break after joining. Adding it as a list element
                // would put a comma, i.e. an empty extra column, at the end of each row.
                writer.append(String.join(",", info)).append("\n");
            }
            // Persist finished pages so an aborted crawl keeps its progress.
            writer.flush();
            out.println("保存好了第"+p+"页");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Opens an HTTP connection to {@code url} with browser-like request headers
 * and returns the response body stream.
 *
 * <p>After connecting, the method sleeps for 1.5 seconds before returning the
 * stream. NOTE(review): presumably to throttle the crawl; confirm.
 * If the thread is interrupted during that pause, the interrupt flag is
 * restored and the stream is returned without further waiting.
 *
 * <p>The caller owns the returned stream and must close it.
 *
 * @param url address to fetch
 * @return the response body stream
 * @throws MalformedURLException if {@code url} is not a valid URL
 * @throws IOException if connecting or reading times out or otherwise fails
 */
public InputStream getInfo(String url) throws MalformedURLException,IOException{
    URL link = new URL(url);
    // No usable IP proxy was found, so requests are sent directly for now.
    HttpURLConnection conn = (HttpURLConnection) link.openConnection();
    conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Mobile Safari/537.36");
    conn.setRequestProperty("Connection", "keep-alive");
    conn.setRequestProperty("Accept", "image/webp,image/apng,image/*,*/*;q=0.8");
    conn.setRequestProperty("Host","www.ixiupet.com");
    conn.setConnectTimeout(3000);
    // Without a read timeout a stalled server would block the crawl forever.
    conn.setReadTimeout(10000);
    // Let a connect failure reach the caller instead of being swallowed.
    conn.connect();
    try {
        Thread.sleep(1500);
    } catch (InterruptedException e) {
        // Keep the interruption visible to the caller's thread handling.
        Thread.currentThread().interrupt();
    }
    return conn.getInputStream();
}
}
//GetProxy agent= new GetProxy();
/*
class GetProxy {
List <String> proxyList;
Random random = new Random();
GetProxy(){
this.proxyList= new LinkedList <String>(Arrays.asList(
"121.69.26.14:8080",
"132.145.89.166:3128"));
}
void removeProxy(String p){
this.proxyList.remove(this.proxyList.indexOf(p));
}
String getProxy() {
//return proxyList.get(random.nextInt(proxyList.size()));
String [] ip = proxyList.get(random.nextInt(proxyList.size())).split(":");
System.out.println(ip[0]+ip[1]);
return new Proxy(Proxy.Type.HTTP,
new InetSocketAddress(ip[0],Integer.valueOf(ip[1])));
}
}*/
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/mengzhilang/pethub.git
git@gitee.com:mengzhilang/pethub.git
mengzhilang
pethub
A repository that stores the code of a WeChat mini program
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891