Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
sq-weixin-api
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
庄新伟
sq-weixin-api
Commits
235218a7
Commit
235218a7
authored
Jun 05, 2024
by
盖献康
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
爬取网站demo
parent
a0452c60
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
221 additions
and
9 deletions
+221
-9
pom.xml
pom.xml
+6
-0
Application.java
src/main/java/top/iszsq/weixin/Application.java
+2
-9
Website.java
src/main/java/top/iszsq/weixin/Website.java
+213
-0
No files found.
pom.xml
View file @
235218a7
...
@@ -62,6 +62,12 @@
...
@@ -62,6 +62,12 @@
<artifactId>
jsoup
</artifactId>
<artifactId>
jsoup
</artifactId>
<version>
1.14.3
</version>
<version>
1.14.3
</version>
</dependency>
</dependency>
<dependency>
<groupId>
org.seleniumhq.selenium
</groupId>
<artifactId>
selenium-java
</artifactId>
<version>
4.0.0
</version>
<!-- Pinned to 4.0.0; check for a newer Selenium release before relying on this version -->
</dependency>
</dependencies>
</dependencies>
</project>
</project>
src/main/java/top/iszsq/weixin/Application.java
View file @
235218a7
...
@@ -159,17 +159,10 @@ public class Application {
...
@@ -159,17 +159,10 @@ public class Application {
Document
document
=
Jsoup
.
connect
(
article
.
getLink
()).
get
();
Document
document
=
Jsoup
.
connect
(
article
.
getLink
()).
get
();
Element
entiryElement
=
document
.
getElementById
(
"img-content"
);
Element
entiryElement
=
document
.
getElementById
(
"img-content"
);
if
(
entiryElement
!=
null
)
{
if
(
entiryElement
!=
null
)
{
String
articleTitle
=
Objects
.
requireNonNull
(
entiryElement
.
select
(
"#activity-name"
)).
text
();
System
.
out
.
println
(
"--内容h5"
+
entiryElement
.
html
());
System
.
out
.
println
(
"标题---"
+
articleTitle
+
"----"
);
Element
mainContent
=
entiryElement
.
getElementById
(
"js_content"
);
assert
mainContent
!=
null
;
System
.
out
.
println
(
"内容---"
+
mainContent
.
text
()
+
"----"
);
Elements
imgs
=
mainContent
.
getElementsByTag
(
"img"
);
for
(
Element
img
:
imgs
)
{
System
.
out
.
println
(
"图片---"
+
img
.
attr
(
"data-src"
));
}
}
}
}
}
System
.
out
.
println
(
"---条数:"
+
exList
.
size
());
}
}
}
}
...
...
src/main/java/top/iszsq/weixin/Website.java
0 → 100644
View file @
235218a7
package
top
.
iszsq
.
weixin
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.jsoup.nodes.Entities
;
import
org.jsoup.select.Elements
;
import
org.openqa.selenium.By
;
import
org.openqa.selenium.OutputType
;
import
org.openqa.selenium.TakesScreenshot
;
import
org.openqa.selenium.WebDriver
;
import
org.openqa.selenium.chrome.ChromeDriver
;
import
org.openqa.selenium.support.ui.ExpectedConditions
;
import
org.openqa.selenium.support.ui.WebDriverWait
;
import
java.io.File
;
import
java.io.IOException
;
import
java.nio.file.Files
;
import
java.nio.file.Paths
;
import
java.time.Duration
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.Objects
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
/**
 * Demo crawler for the article index at
 * https://www.iachina.cn/col/col23/index.html.
 *
 * <p>The index page is fetched, every {@code <record><![CDATA[...]]></record>}
 * fragment found in its HTML is inspected, and for each fragment dated on or
 * after {@link #date} the linked article is downloaded with a site-specific
 * extractor and printed to standard output.
 *
 * @author gxk
 */
public class Website {

    /** Index page whose embedded records are scanned for article links. */
    public static String url = "https://www.iachina.cn/col/col23/index.html";

    /**
     * Earliest publication date (inclusive) of the records to process.
     *
     * <p>Compared lexicographically against the text of each record's
     * {@code div.date} element, so it must use the same textual format as the
     * site. NOTE(review): assumes the site renders dates as {@code yyyy-MM-dd}
     * - confirm against the live page.
     *
     * <p>The previous value {@code "20234-01-01"} was a typo; it selected the
     * same records only because {@code '4'} sorts after {@code '-'}.
     */
    public static String date = "2024-01-01";

    /** Matches one embedded record and captures the CDATA payload as group 1. */
    private static final Pattern RECORD_PATTERN =
            Pattern.compile("<record><!\\[CDATA\\[(.*?)]]></record>", Pattern.DOTALL);

    /** Downloads an article page and returns the extracted HTML fragment. */
    @FunctionalInterface
    private interface ArticleFetcher {
        String fetch(String articleUrl) throws IOException;
    }

    /**
     * Supported article sources, in the order they are tried. The first
     * source whose host fragment occurs in the article URL handles it.
     */
    private enum Source {
        WEIXIN("mp.weixin.qq.com", "weixin--", Website::getWeiXin),
        CS_COM("www.cs.com.cn", "cscom--", Website::getCsComCn),
        CBIMC("www.cbimc.cn", "cbimc--", Website::getCbimcCn),
        NEWS_CN("www.news.cn", "newscn--", Website::getNewsCn),
        CHINALIFE("e-chinalife.com", "chinalife--", Website::getChinalife),
        PICC("property.picc.com", "picccom--", Website::getPiccCom),
        FINANCE_CNR("finance.cnr.cn", "financecnr--", Website::getFinanceCnr),
        HSBC_INSURANCE("www.hsbcinsurance.com.cn", "hsbcinsurance--", Website::getHsbcinsurance),
        FINANCE_PEOPLE("finance.people.com.cn", "financepeople--", Website::getFinancePeople),
        FINANCE_CHINA("finance.china.com.cn", "financechina--", Website::getFinanceChina);

        /** Substring of the article URL that identifies this source. */
        private final String host;
        /** Prefix printed before the extracted HTML. */
        private final String label;
        /** Extractor used for pages of this source. */
        private final ArticleFetcher fetcher;

        Source(String host, String label, ArticleFetcher fetcher) {
            this.host = host;
            this.label = label;
            this.fetcher = fetcher;
        }
    }

    /**
     * Fetches the index page, processes every record dated on or after
     * {@link #date}, and finally prints the number of such records.
     *
     * @param args unused
     * @throws IOException if the index page or any article page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // Load the index page and work on its serialized HTML.
        Document document = Jsoup.connect(url).get();
        List<String> records = extractRecentRecords(document.html());

        for (String record : records) {
            String articleUrl = Jsoup.parse(record).getElementsByTag("a").attr("href");
            printArticle(articleUrl);
        }
        System.out.println(records.size());
    }

    /**
     * Collects the CDATA payload of every embedded record whose
     * {@code div.date} text is not lexicographically before {@link #date}.
     *
     * @param html serialized HTML of the index page
     * @return payloads of the matching records, in document order
     */
    private static List<String> extractRecentRecords(String html) {
        List<String> recent = new ArrayList<>();
        Matcher matcher = RECORD_PATTERN.matcher(html);
        while (matcher.find()) {
            String record = matcher.group(1);
            String published = Jsoup.parse(record).select("div.date").text();
            // Inclusive lower bound: a record without a date yields "" and is skipped.
            if (date.compareTo(published) <= 0) {
                recent.add(record);
            }
        }
        return recent;
    }

    /**
     * Prints the extracted content of an article using the first matching
     * {@link Source}; URLs of unsupported sites are silently ignored.
     *
     * @param articleUrl link taken from a record
     * @throws IOException if the article page cannot be fetched
     */
    private static void printArticle(String articleUrl) throws IOException {
        for (Source source : Source.values()) {
            if (articleUrl.contains(source.host)) {
                System.out.println(source.label + source.fetcher.fetch(articleUrl));
                return;
            }
        }
    }

    /**
     * Extracts an article from mp.weixin.qq.com.
     *
     * <p>Returns the inner HTML of the element with id {@code img-content},
     * with every literal occurrence of {@code data-src} replaced by
     * {@code src}. NOTE(review): the replacement is purely textual, so it also
     * rewrites occurrences inside visible text or longer attribute names.
     *
     * @param url article URL
     * @return the rewritten inner HTML
     * @throws IOException if the page cannot be fetched
     * @throws NullPointerException if the page has no {@code img-content} element
     */
    public static String getWeiXin(String url) throws IOException {
        Document document = Jsoup.connect(url).get();
        Element content = Objects.requireNonNull(
                document.getElementById("img-content"),
                "no element with id 'img-content' in " + url);
        // Literal replacement; no regular expression is needed here.
        return content.html().replace("data-src", "src");
    }

    /**
     * Extracts an article from www.cs.com.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code article.cont_article}
     * @throws IOException if the page cannot be fetched
     */
    public static String getCsComCn(String url) throws IOException {
        return Jsoup.connect(url).get().select("article.cont_article").html();
    }

    /**
     * Extracts an article from www.cbimc.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.left-l}
     * @throws IOException if the page cannot be fetched
     */
    public static String getCbimcCn(String url) throws IOException {
        return Jsoup.connect(url).get().select("div.left-l").html();
    }

    /**
     * Extracts an article from www.news.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.header-cont} followed by the inner HTML
     *         of the element with id {@code detailContent}
     * @throws IOException if the page cannot be fetched
     * @throws NullPointerException if the page has no {@code detailContent} element
     */
    public static String getNewsCn(String url) throws IOException {
        Document document = Jsoup.connect(url).get();
        String title = document.select("div.header-cont").html();
        Element body = Objects.requireNonNull(
                document.getElementById("detailContent"),
                "no element with id 'detailContent' in " + url);
        return title + body.html();
    }

    /**
     * Extracts an article from e-chinalife.com.
     *
     * @param url article URL
     * @return inner HTML of {@code div.darticle}
     * @throws IOException if the page cannot be fetched
     */
    public static String getChinalife(String url) throws IOException {
        return Jsoup.connect(url).get().select("div.darticle").html();
    }

    /**
     * Extracts an article from property.picc.com.
     *
     * @param url article URL
     * @return inner HTML of {@code div.news-infoBox-content}
     * @throws IOException if the page cannot be fetched
     */
    public static String getPiccCom(String url) throws IOException {
        return Jsoup.connect(url).get().select("div.news-infoBox-content").html();
    }

    /**
     * Extracts an article from finance.cnr.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.article-header} followed by the inner
     *         HTML of {@code div.article-content}
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinanceCnr(String url) throws IOException {
        Document document = Jsoup.connect(url).get();
        String title = document.select("div.article-header").html();
        String body = document.select("div.article-content").html();
        return title + body;
    }

    /**
     * Extracts an article from www.hsbcinsurance.com.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.cc-column}
     * @throws IOException if the page cannot be fetched
     */
    public static String getHsbcinsurance(String url) throws IOException {
        return Jsoup.connect(url).get().select("div.cc-column").html();
    }

    /**
     * Extracts an article from finance.people.com.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.col.col-1.fl}
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinancePeople(String url) throws IOException {
        return Jsoup.connect(url).get().select("div.col.col-1.fl").html();
    }

    /**
     * Extracts an article from finance.china.com.cn.
     *
     * @param url article URL
     * @return inner HTML of {@code div.wrap.c.top} followed by the inner HTML
     *         of {@code div.fl.navl}
     * @throws IOException if the page cannot be fetched
     */
    public static String getFinanceChina(String url) throws IOException {
        Document document = Jsoup.connect(url).get();
        String title = document.select("div.wrap.c.top").html();
        String body = document.select("div.fl.navl").html();
        return title + body;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment