Commit 235218a7 authored by 盖献康

爬取网站demo

parent a0452c60
......@@ -62,6 +62,12 @@
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.0.0</version> <!-- make sure to use the latest version -->
</dependency>
</dependencies>
</project>
......@@ -159,17 +159,10 @@ public class Application {
Document document = Jsoup.connect(article.getLink()).get();
Element entiryElement = document.getElementById("img-content");
if (entiryElement != null) {
String articleTitle = Objects.requireNonNull(entiryElement.select("#activity-name")).text();
System.out.println("标题---" + articleTitle + "----");
Element mainContent = entiryElement.getElementById("js_content");
assert mainContent != null;
System.out.println("内容---" + mainContent.text() + "----");
Elements imgs = mainContent.getElementsByTag("img");
for (Element img : imgs) {
System.out.println("图片---" + img.attr("data-src"));
}
System.out.println("--内容h5" + entiryElement.html());
}
}
System.out.println("---条数:"+ exList.size());
}
}
......
package top.iszsq.weixin;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo crawler for the article list at
 * https://www.iachina.cn/col/col23/index.html.
 *
 * <p>The list page embeds its entries as {@code <record><![CDATA[...]]></record>}
 * fragments. {@link #main(String[])} extracts those fragments, keeps the ones
 * dated on or after {@link #date}, follows each entry's link and prints the
 * article HTML taken with a selector specific to the hosting site.
 *
 * @author gxk
 */
public class Website {

    /** Address of the list page that {@link #main(String[])} crawls. */
    public static String url = "https://www.iachina.cn/col/col23/index.html";

    /**
     * Earliest publication date (inclusive) of the entries to crawl, written as
     * {@code yyyy-MM-dd}. Dates are compared as plain strings, which orders
     * them correctly only while both sides use this zero-padded format.
     */
    public static String date = "2024-01-01";

    /** Matches one hidden list entry; group 1 is the HTML inside the CDATA section. */
    private static final Pattern RECORD_PATTERN =
            Pattern.compile("<record><!\\[CDATA\\[(.*?)]]></record>", Pattern.DOTALL);

    /**
     * Crawls the list page, prints the content of every recent article hosted
     * on a supported site, then prints the number of recent entries found.
     *
     * @param args unused
     * @throws IOException if the list page or any article page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        String html = Jsoup.connect(url).get().html();
        List<String> articleUrls = collectRecentArticleUrls(html);
        for (String articleUrl : articleUrls) {
            printArticle(articleUrl);
        }
        System.out.println(articleUrls.size());
    }

    /**
     * Extracts the link of every embedded record whose {@code div.date} text is
     * on or after {@link #date}. Each record is parsed only once.
     */
    private static List<String> collectRecentArticleUrls(String html) {
        List<String> articleUrls = new ArrayList<>();
        Matcher matcher = RECORD_PATTERN.matcher(html);
        while (matcher.find()) {
            Document record = Jsoup.parse(matcher.group(1));
            String published = record.select("div.date").text();
            // A record without a date yields "", which sorts before any cutoff
            // and is therefore skipped.
            if (date.compareTo(published) <= 0) {
                articleUrls.add(record.getElementsByTag("a").attr("href"));
            }
        }
        return articleUrls;
    }

    /**
     * Fetches one article with the extractor matching its host and prints the
     * result. Links to hosts without an extractor are skipped silently.
     */
    private static void printArticle(String articleUrl) throws IOException {
        if (articleUrl.contains("mp.weixin.qq.com")) {
            System.out.println("weixin--" + getWeiXin(articleUrl));
        } else if (articleUrl.contains("www.cs.com.cn")) {
            System.out.println("cscom--" + getCsComCn(articleUrl));
        } else if (articleUrl.contains("www.cbimc.cn")) {
            System.out.println("cbimc--" + getCbimcCn(articleUrl));
        } else if (articleUrl.contains("www.news.cn")) {
            System.out.println("newscn--" + getNewsCn(articleUrl));
        } else if (articleUrl.contains("e-chinalife.com")) {
            System.out.println("chinalife--" + getChinalife(articleUrl));
        } else if (articleUrl.contains("property.picc.com")) {
            System.out.println("picccom--" + getPiccCom(articleUrl));
        } else if (articleUrl.contains("finance.cnr.cn")) {
            System.out.println("financecnr--" + getFinanceCnr(articleUrl));
        } else if (articleUrl.contains("www.hsbcinsurance.com.cn")) {
            System.out.println("hsbcinsurance--" + getHsbcinsurance(articleUrl));
        } else if (articleUrl.contains("finance.people.com.cn")) {
            System.out.println("financepeople--" + getFinancePeople(articleUrl));
        } else if (articleUrl.contains("finance.china.com.cn")) {
            System.out.println("financechina--" + getFinanceChina(articleUrl));
        }
    }

    /** Downloads and parses the page at {@code url}. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).get();
    }

    /**
     * Returns the element with the given id, failing with a descriptive
     * {@link NullPointerException} when the page does not contain it.
     */
    private static Element requireElementById(Document document, String id, String url) {
        return Objects.requireNonNull(
                document.getElementById(id),
                "element #" + id + " not found in " + url);
    }

    /**
     * Extracts an article hosted on mp.weixin.qq.com.
     *
     * @param url address of the article page
     * @return inner HTML of the {@code #img-content} element with every
     *         {@code data-src} occurrence rewritten to {@code src}
     * @throws IOException if the page cannot be fetched
     * @throws NullPointerException if the page has no {@code #img-content} element
     */
    public static String getWeiXin(String url) throws IOException {
        String html = requireElementById(fetch(url), "img-content", url).html();
        // Literal replacement; no regular expression is needed here.
        return html.replace("data-src", "src");
    }

    /**
     * Extracts an article hosted on www.cs.com.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code article.cont_article}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getCsComCn(String url) throws IOException {
        return fetch(url).select("article.cont_article").html();
    }

    /**
     * Extracts an article hosted on www.cbimc.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.left-l}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getCbimcCn(String url) throws IOException {
        return fetch(url).select("div.left-l").html();
    }

    /**
     * Extracts an article hosted on www.news.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.header-cont} followed by the inner HTML
     *         of the {@code #detailContent} element
     * @throws IOException if the page cannot be fetched
     * @throws NullPointerException if the page has no {@code #detailContent} element
     */
    public static String getNewsCn(String url) throws IOException {
        Document document = fetch(url);
        String title = document.select("div.header-cont").html();
        String content = requireElementById(document, "detailContent", url).html();
        return title + content;
    }

    /**
     * Extracts an article hosted on e-chinalife.com.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.darticle}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getChinalife(String url) throws IOException {
        return fetch(url).select("div.darticle").html();
    }

    /**
     * Extracts an article hosted on property.picc.com.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.news-infoBox-content}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getPiccCom(String url) throws IOException {
        return fetch(url).select("div.news-infoBox-content").html();
    }

    /**
     * Extracts an article hosted on finance.cnr.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.article-header} followed by the inner
     *         HTML of {@code div.article-content}
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinanceCnr(String url) throws IOException {
        Document document = fetch(url);
        String title = document.select("div.article-header").html();
        String content = document.select("div.article-content").html();
        return title + content;
    }

    /**
     * Extracts an article hosted on www.hsbcinsurance.com.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.cc-column}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getHsbcinsurance(String url) throws IOException {
        return fetch(url).select("div.cc-column").html();
    }

    /**
     * Extracts an article hosted on finance.people.com.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.col.col-1.fl}; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinancePeople(String url) throws IOException {
        return fetch(url).select("div.col.col-1.fl").html();
    }

    /**
     * Extracts an article hosted on finance.china.com.cn.
     *
     * @param url address of the article page
     * @return inner HTML of {@code div.wrap.c.top} followed by the inner HTML
     *         of {@code div.fl.navl}
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinanceChina(String url) throws IOException {
        Document document = fetch(url);
        String title = document.select("div.wrap.c.top").html();
        String content = document.select("div.fl.navl").html();
        return title + content;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment