Commit 984402fc authored by 盖献康's avatar 盖献康

按照指定条数爬取文章

parent e61a7d47
...@@ -68,6 +68,12 @@ ...@@ -68,6 +68,12 @@
<artifactId>selenium-java</artifactId> <artifactId>selenium-java</artifactId>
<version>4.0.0</version> <!-- 确保使用最新版本 --> <version>4.0.0</version> <!-- 确保使用最新版本 -->
</dependency> </dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
</dependency>
</dependencies> </dependencies>
</project> </project>
...@@ -152,17 +152,18 @@ public class Application { ...@@ -152,17 +152,18 @@ public class Application {
BizData select = list.get(index - 1); BizData select = list.get(index - 1);
System.out.println(String.format("--好的,开始搜索【%s】的文章...", select.getNickname())); System.out.println(String.format("--好的,开始搜索【%s】的文章...", select.getNickname()));
WxResultBody<List<Article>> findExList = WeiXinApi.findExList(select.getFakeid()); // WxResultBody<List<Article>> findExList = WeiXinApi.findExList(select.getFakeid());
List<Article> exList = findExList.getApp_msg_list(); // List<Article> exList = findExList.getApp_msg_list();
for (Article article : exList) { List<Article> articleList = WeiXinApi.getArticleList(30, select.getFakeid());
for (Article article : articleList) {
System.out.println("---" + article.getTitle() + "-----" + article.getLink()); System.out.println("---" + article.getTitle() + "-----" + article.getLink());
Document document = Jsoup.connect(article.getLink()).get(); // Document document = Jsoup.connect(article.getLink()).get();
Element entiryElement = document.getElementById("img-content"); // Element entiryElement = document.getElementById("img-content");
if (entiryElement != null) { // if (entiryElement != null) {
System.out.println("--内容h5" + entiryElement.html()); // System.out.println("--内容h5" + entiryElement.html());
} // }
} }
System.out.println("---条数:"+ exList.size()); System.out.println("---条数:"+ articleList.size());
} }
} }
......
package top.iszsq.weixin.api; package top.iszsq.weixin.api;
import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.core.type.TypeReference;
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import top.iszsq.weixin.enums.WxResultStatus; import top.iszsq.weixin.enums.WxResultStatus;
import top.iszsq.weixin.exceptions.WxApiException; import top.iszsq.weixin.exceptions.WxApiException;
...@@ -11,9 +12,8 @@ import top.iszsq.weixin.okhttp.MyCookieStore; ...@@ -11,9 +12,8 @@ import top.iszsq.weixin.okhttp.MyCookieStore;
import top.iszsq.weixin.utils.HttpUtils; import top.iszsq.weixin.utils.HttpUtils;
import top.iszsq.weixin.utils.JsonUtils; import top.iszsq.weixin.utils.JsonUtils;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.util.*;
import java.util.List; import java.util.stream.Collectors;
import java.util.Map;
/** /**
* 微信api封装 * 微信api封装
...@@ -156,7 +156,7 @@ public class WeiXinApi { ...@@ -156,7 +156,7 @@ public class WeiXinApi {
Map<String, String> params = new HashMap<>(10); Map<String, String> params = new HashMap<>(10);
params.put("action", "list_ex"); params.put("action", "list_ex");
params.put("begin", "0"); params.put("begin", "0");
params.put("count", "5"); params.put("count", "20");
params.put("fakeid", fakeid); params.put("fakeid", fakeid);
params.put("token", MyCookieStore.getToken()); params.put("token", MyCookieStore.getToken());
params.put("type", "9"); params.put("type", "9");
...@@ -173,6 +173,74 @@ public class WeiXinApi { ...@@ -173,6 +173,74 @@ public class WeiXinApi {
} }
/**
 * Searches the articles of an official account (paged variant).
 *
 * <p>Issues a single {@code list_ex} request against the {@code findListEx} endpoint
 * and parses the JSON response into a {@code WxResultBody}.
 *
 * @param begin  value sent as the {@code begin} request parameter (start offset of the page)
 * @param count  value sent as the {@code count} request parameter (requested page size)
 * @param fakeId value sent as the {@code fakeid} request parameter (identifies the account)
 * @return the parsed response body for the requested page
 */
public static WxResultBody<List<Article>> findExList(int begin, int count, String fakeId) {
    // Describes the generic target type for the JSON parser.
    TypeReference<WxResultBody<List<Article>>> bodyType =
            new TypeReference<WxResultBody<List<Article>>>() {};

    Map<String, String> query = new HashMap<>(10);
    query.put("action", "list_ex");
    query.put("begin", Integer.toString(begin));
    query.put("count", Integer.toString(count));
    query.put("fakeid", fakeId);
    // The token comes from the stored login session.
    query.put("token", MyCookieStore.getToken());
    query.put("type", "9");
    query.put("query", "");
    query.put("lang", "zh_CN");
    query.put("f", "json");
    query.put("ajax", "1");

    return parseWxResultBody(HttpUtils.doGet(URL_MAP.get("findListEx"), query), bodyType);
}
/**
 * Fetches up to {@code num} articles of an official account by requesting
 * consecutive pages through {@link #findExList(int, int, String)}.
 *
 * <p>Every request uses the same page size and the offset advances by exactly one
 * page, so consecutive pages never overlap. Paging stops as soon as enough articles
 * have been collected or a page comes back without any article, which means the
 * result may hold fewer than {@code num} entries.
 *
 * @param num    maximum number of articles to return; a value {@code <= 0} yields an empty list
 * @param fakeId identifier of the official account, passed through to {@code findExList}
 * @return a list with at most {@code num} articles, in the order the pages returned them
 */
public static List<Article> getArticleList(int num, String fakeId) {
    List<Article> articleList = new ArrayList<>();
    if (num <= 0) {
        return articleList;
    }

    // Fixed page size. Growing the page size while moving the offset by a smaller
    // step would make successive pages overlap and produce duplicate articles.
    // NOTE(review): assumes "begin" is counted in the same unit as "count" - confirm against the API.
    final int pageSize = 5;
    int begin = 0;

    while (articleList.size() < num) {
        WxResultBody<List<Article>> exList = findExList(begin, pageSize, fakeId);
        List<Article> appMsgList = exList == null ? null : exList.getApp_msg_list();
        if (appMsgList == null || appMsgList.isEmpty()) {
            // Nothing more to fetch; without this exit the loop would never terminate.
            break;
        }
        articleList.addAll(appMsgList);
        begin += pageSize;

        // Throttle only when another request is going to follow.
        if (articleList.size() < num) {
            delayedSleep();
        }
    }

    // The last page may overshoot the requested amount, so trim to num.
    return articleList.stream().limit(num).collect(Collectors.toList());
}
/**
 * Pauses the calling thread for a random duration between 200 and 500
 * milliseconds (both bounds inclusive).
 *
 * <p>The checked {@code InterruptedException} of {@code Thread.sleep} is propagated
 * undeclared via Lombok's {@code @SneakyThrows}.
 */
@SneakyThrows
public static void delayedSleep() {
    final int lowerBoundMs = 200;
    final int upperBoundMs = 500;
    // nextInt's bound is exclusive, hence the +1 to make the upper bound reachable.
    final int spread = upperBoundMs - lowerBoundMs + 1;
    int pauseMs = lowerBoundMs + new Random().nextInt(spread);
    Thread.sleep(pauseMs);
}
/** /**
* 转成java bean * 转成java bean
* @param jsonRes json结果字符串 * @param jsonRes json结果字符串
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment