
2、实体类
(项目所需的 Maven 依赖,格式为 groupId:artifactId:version:junit:junit:4.11(scope 为 test)、org.jsoup:jsoup:1.10.2、org.apache.httpcomponents:httpcore:4.4.10、org.apache.httpcomponents:httpclient:4.5.6、commons-io:commons-io:2.6、org.slf4j:slf4j-log4j12:1.7.21、org.projectlombok:lombok:1.18.20、com.alibaba:easyexcel:2.2.6)
/**
 * One spreadsheet row describing a page that should be crawled for textbook data.
 *
 * <p>Instances are created by {@code GetAllAreaInfo.getAllArea}, written to Excel with this
 * class as the sheet head, and read back by {@code GetBookInfoByArea.readExcel}. Accessors
 * (e.g. {@code setCity}, {@code getAreaUrl}) are not declared here; they come from Lombok's
 * {@code @Data}.
 */
@Data
public class AreaInfo {
// Excel column 0: URL of the page to fetch for this area.
@ExcelProperty(index = 0)
private String areaUrl;
// Excel column 1: name of the city the area belongs to.
@ExcelProperty(index = 1)
private String city;
// Excel column 2: area (district) name; the crawler stores the city name here
// when a city has no per-district listing.
@ExcelProperty(index = 2)
private String area;
}
/**
 * One spreadsheet row describing a single textbook entry found on an area page.
 *
 * <p>Populated by {@code GetBookInfoByArea.getBookInfo}. The Excel column order is given by
 * the {@code index} values (city, area, grade, subject, version) and is independent of the
 * order in which the fields are declared below. Accessors come from Lombok's {@code @Data}.
 */
@Data
public class BookInfo {
// Excel column 2: grade label, taken from the section's <h3> text.
@ExcelProperty(index = 2)
private String grade;
// Excel column 3: subject, taken from the text of the second anchor in the list item.
@ExcelProperty(index = 3)
private String subject;
// Excel column 0: city name copied from the AreaInfo being processed.
@ExcelProperty(index = 0)
private String city;
// Excel column 1: area name copied from the AreaInfo being processed.
@ExcelProperty(index = 1)
private String area;
// Excel column 4: textbook edition/version, taken from the <i> text of the list item.
@ExcelProperty(index = 4)
private String version;
}
/**
 * In-memory pair of a city name and the URL of that city's page.
 *
 * <p>Built by {@code GetAllAreaInfo.getAllCity} from each link in the city list and consumed
 * by {@code GetAllAreaInfo.getAllArea}. It carries no Excel annotations. Accessors come from
 * Lombok's {@code @Data}.
 */
@Data
public class CityInfo {
// Link text of the city's anchor.
private String city;
// Site prefix concatenated with the anchor's href.
private String url;
}
/**
 * Spreadsheet row with a city, an area and a grade.
 *
 * <p>NOTE(review): nothing in the visible code creates or reads this type; judging by the
 * name it presumably records areas/grades for which no textbook was found. Confirm against
 * the caller. Accessors come from Lombok's {@code @Data}.
 */
@Data
public class NoBookArea {
// Excel column 0: city name.
@ExcelProperty(index = 0)
private String city;
// Excel column 1: area (district) name.
@ExcelProperty(index = 1)
private String area;
// Excel column 2: grade label.
@ExcelProperty(index = 2)
private String grade;
}
3、爬取所有需要遍历的网页地址,并写入 Excel 中
public class GetAllAreaInfo {
public static void main(String[] args) throws IOException {
getAllCity("xxxx");
}
public static void getAllCity(String baseUrl) throws IOException {
//获取到整个页面对象
document document = Jsoup.parse(new URL(baseUrl), 1000000);
//根据标签id获取到标签元素所对应的整个元素对象
Element citylist = document.getElementById("citylist");
//获取到所有a标签
Elements as = citylist.getElementsByTag("a");
//将所有a标签的链接和值取出来
List cityAreaList = new ArrayList<>();
for (Element a : as) {
String path = a.attr("href");
String text = a.text();
CityInfo cityArea = new CityInfo();
cityArea.setCity(text);
cityArea.setUrl("xxxx" + path);
cityAreaList.add(cityArea);
System.out.println(cityArea);
}
//将含有链接的对象传给getAllArea方法,获取到所有区域
getAllArea(cityAreaList);
}
public static void getAllArea(List cityAreaList) throws IOException {
int vaild=0;
int invaild=0;
List areaInfoList = new linkedList<>();
for (CityInfo cityInfo:cityAreaList) {
String cityUrl = cityInfo.getUrl();
try{
//获取到整个页面对象
Connection connect = Jsoup.connect(cityUrl);
document document = connect.get();
//根据标签class获取到标签元素所对应的整个元素对象
if(document.getElementsByClass("Districtlist").size()!=0&&document.getElementsByClass("Districtlist").size()!=3){
Elements districtlist = document.getElementsByClass("Districtlist");
//获取到所有a标签
Elements as = districtlist.first().getElementsByTag("a");
for (Element a : as) {
String href = a.attr("href");
String text = a.text();
System.out.println(href+" "+text);
//判断,不选取小学,初中和高中这个url地址
if(!(text.equals("小学")||text.equals("初中")||text.equals("高中"))){
//将区域的相关信息(url地址,城市,区域)存入到区域对象中
AreaInfo areaInfo = new AreaInfo();
areaInfo.setCity(cityInfo.getCity());
areaInfo.setAreaUrl("http://www.dzkbw.com" + href);
areaInfo.setArea(text);
//将区域url,相关信息获取写入list集合中
areaInfoList.add(areaInfo);
}
}
}else{
//这个市下面没有分区域的电子教材
System.out.println(cityUrl+"--------------");
AreaInfo areaInfo = new AreaInfo();
areaInfo.setCity(cityInfo.getCity());
areaInfo.setAreaUrl(cityInfo.getUrl());
areaInfo.setArea(cityInfo.getCity());
//将区域url,相关信息获取写入list集合中
areaInfoList.add(areaInfo);
}
vaild++;
}catch (Exception e){
invaild++;
e.printStackTrace();
}
}
System.out.println("总共城市数:"+cityAreaList.size());
System.out.println("页面可以访问的城市数:"+vaild);
System.out.println("页面失效不可以访问的城市数:"+invaild);
//将获取的区域信息写入excel中
writeExcel(areaInfoList);
}
public static void writeExcel(List list){
EasyExcel.write("D:\areaInfo.xlsx")
.head(AreaInfo.class)
.excelType(ExcelTypeEnum.XLSX)
.sheet("area")
.doWrite(list);
}
public static List readExcel(){
final List list = new ArrayList<>();
EasyExcel.read("D:\areaInfo.xlsx")
.head(AreaInfo.class)
.sheet()
.registerReadListener(new AnalysisEventListener() {
@Override
public void invoke(AreaInfo areaInfo, AnalysisContext analysisContext) {
list.add(areaInfo);
}
@Override
public void doAfterAllAnalysed(AnalysisContext analysisContext) {
System.out.println("数据读取完毕");
}
}).doRead();
return list;
}
}
4、将 Excel 中所有需要查询的网页地址存入 LinkedBlockingQueue 中;在多线程的情况下,每个线程每次从队列头部取出一个地址,发送请求并解析页面获取数据。解析得到的对象数据存放在 CopyOnWriteArrayList 集合中,最后写入 Excel。
public class GetBookInfoByArea implements Runnable{
//存放所有遍历出来的教材信息,存放再线程安全的list中
private static List list = new CopyOnWriteArrayList<>();
//使用阻塞队列存放所有区域的信息,每次都从这个队列获取区域地址,发送请求进行解析
private static linkedBlockingQueue areaInfolinkedBlockingQueue = readExcel();
@Override
public void run() {
//获取队列中的area对象,获取到url地址,然后发送http请求,获取到所需要的信息,然后存储在list中,最后写入excel中
//获取到整个页面对象
while(areaInfolinkedBlockingQueue.peek()!=null){
//获取到区域对应的url地址
AreaInfo areaInfo = areaInfolinkedBlockingQueue.poll();
try{
Connection connect = Jsoup.connect(areaInfo.getAreaUrl());
document document = connect.get();
getBookInfo(document,areaInfo);
writeExcel(list);
}catch (Exception e){
e.printStackTrace();
}
}
}
//获取各个年级的教材信息,这块不好根据id获取标签,只能根据class获取,所以比较麻烦
public static void getBookInfo(document document,AreaInfo areaInfo){
//获取一年级和高二的相关信息
if (document.getElementsByClass("i_d i_blue").size()!=0){
Elements i_d_i_blue = document.getElementsByClass("i_d i_blue");
for (Element e: i_d_i_blue) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取二年级和高三的相关信息
if (document.getElementsByClass("i_d i_green").size()!=0) {
Elements i_d_i_green = document.getElementsByClass("i_d i_green");
for (Element e: i_d_i_green) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取三年级的相关信息
if (document.getElementsByClass("i_d i_purple").size()!=0){
Elements i_d_i_purple = document.getElementsByClass("i_d i_purple");
for (Element e: i_d_i_purple) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取四年级的相关信息
if (document.getElementsByClass("i_d i_red").size()!=0){
Elements i_d_i_red = document.getElementsByClass("i_d i_red");
for (Element e: i_d_i_red) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取五年级的相关信息
if (document.getElementsByClass("i_d i_preen").size()!=0){
Elements i_d_i_preen = document.getElementsByClass("i_d i_preen");
for (Element e: i_d_i_preen) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取六年级的相关信息
if (document.getElementsByClass("i_d i_blue2").size()!=0){
Elements i_d_i_blue2 = document.getElementsByClass("i_d i_blue2");
for (Element e: i_d_i_blue2) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取七年级的相关信息
if (document.getElementsByClass("i_d i_green2").size()!=0){
Elements i_d_i_green2 = document.getElementsByClass("i_d i_green2");
for (Element e: i_d_i_green2) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取八年级的相关信息
if (document.getElementsByClass("i_d i_purple2").size()!=0){
Elements i_d_i_purple2 = document.getElementsByClass("i_d i_purple2");
for (Element e: i_d_i_purple2) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取九年级的相关信息
if (document.getElementsByClass("i_d i_red2").size()!=0){
Elements i_d_i_red2 = document.getElementsByClass("i_d i_red2");
for (Element e: i_d_i_red2) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
//获取高一年级的相关信息
if (document.getElementsByClass("i_d i_preen2").size()!=0){
Elements i_d_i_preen2 = document.getElementsByClass("i_d i_preen2");
for (Element e: i_d_i_preen2) {
Element h3 = e.getElementsByTag("h3").first();
Elements lis = e.getElementsByTag("li");
for (Element li:lis){
//将相关教材信息写在BookInfo对象里
BookInfo bookInfo = new BookInfo();
bookInfo.setCity(areaInfo.getCity());
bookInfo.setArea(areaInfo.getArea());
bookInfo.setGrade(h3.text());
bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
bookInfo.setVersion(li.getElementsByTag("i").text());
System.out.println(bookInfo.toString());
list.add(bookInfo);
}
}
}
}
//返回一个线程安全的队列,队列里存储的是area的excel里的所有AreaInfo对象,便于多线程下每个线程从队列头部获取url地址。
public static linkedBlockingQueue readExcel(){
final linkedBlockingQueue areaInfolinkedBlockingQueue = new linkedBlockingQueue<>();
EasyExcel.read("D:\可正常访问的城市区域信息.xlsx")
.head(AreaInfo.class)
.sheet()
.registerReadListener(new AnalysisEventListener() {
@Override
public void invoke(AreaInfo areaInfo, AnalysisContext analysisContext) {
areaInfolinkedBlockingQueue.add(areaInfo);
}
@Override
public void doAfterAllAnalysed(AnalysisContext analysisContext) {
System.out.println("数据读取完毕");
}
}).doRead();
return areaInfolinkedBlockingQueue;
}
//将结果写入excel中
public static void writeExcel(List list){
EasyExcel.write("D:\各区教材信息.xlsx")
.head(BookInfo.class)
.excelType(ExcelTypeEnum.XLSX)
.sheet("book")
.doWrite(list);
}
}
5、主方法
/**
 * Entry point of the textbook crawl: launches the worker threads that drain the shared
 * area queue.
 */
public class Main {
    public static void main(String[] args) {
        // A single Runnable is shared by every worker thread.
        GetBookInfoByArea crawlTask = new GetBookInfoByArea();
        // Four workers; the original author reports roughly a 3x speed-up over one thread.
        int remaining = 4;
        while (remaining > 0) {
            Thread worker = new Thread(crawlTask);
            worker.start();
            remaining--;
        }
    }
}
以上代码是根据目标网站的 HTML 结构进行解析的,因此只适用于该网站;若要爬取其他网站,需要按其页面结构重新编写解析逻辑。