栏目分类:
子分类:
返回
终身学习网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
终身学习网 > IT > 软件开发 > 后端开发 > Java

java爬取网页数据

Java 更新时间:发布时间: 百科书网 趣学号
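1、maven依赖
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.10</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.6</version>
    </dependency>

    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.6</version>
    </dependency>

    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.21</version>
    </dependency>

    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.20</version>
    </dependency>

    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>easyexcel</artifactId>
      <version>2.2.6</version>
    </dependency>
  </dependencies>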
2、 实体类
/**
 * One crawl target: a city/district pair together with the URL of its page.
 *
 * <p>Instances are written to and read back from an Excel sheet; each field is bound to a fixed
 * column through {@code @ExcelProperty(index = ...)}. Accessors are not written by hand; they
 * presumably come from Lombok's {@code @Data} (Lombok is in the dependency list).
 */
@Data
public class AreaInfo {
    /** URL of the page to crawl for this area (Excel column 0). */
    @ExcelProperty(index = 0)
    private String areaUrl;
    /** Name of the city the area belongs to (Excel column 1). */
    @ExcelProperty(index = 1)
    private String city;
    /** Name of the district; the crawler stores the city name here when a city has no districts (Excel column 2). */
    @ExcelProperty(index = 2)
    private String area;
}

/**
 * One textbook entry extracted from an area page.
 *
 * <p>Each field is bound to a fixed Excel column through {@code @ExcelProperty(index = ...)}.
 * Note that the declaration order of the fields differs from the column order: the sheet is
 * laid out as city, area, grade, subject, version.
 */
@Data
public class BookInfo {
    /** Grade heading the entry was listed under (Excel column 2). */
    @ExcelProperty(index = 2)
    private String grade;
    /** Subject name (Excel column 3). */
    @ExcelProperty(index = 3)
    private String subject;
    /** City the entry belongs to (Excel column 0). */
    @ExcelProperty(index = 0)
    private String city;
    /** District the entry belongs to (Excel column 1). */
    @ExcelProperty(index = 1)
    private String area;
    /** Textbook version/edition text (Excel column 4). */
    @ExcelProperty(index = 4)
    private String version;
}

/**
 * A city link collected from the city index page: the link text and the URL built from its
 * {@code href}. Unlike the other entity classes it carries no Excel column mapping.
 */
@Data
public class CityInfo {
    /** City name (the text of the link). */
    private String city;
    /** URL of the city's page. */
    private String url;
}

/**
 * Excel row made of a city, a district and a grade.
 *
 * <p>NOTE(review): this class is not referenced by any code visible in this file; judging by
 * its name it presumably records areas for which no textbook was found - confirm with the
 * author before relying on that.
 */
@Data
public class NoBookArea {
    /** City name (Excel column 0). */
    @ExcelProperty(index = 0)
    private String city;
    /** District name (Excel column 1). */
    @ExcelProperty(index = 1)
    private String area;
    /** Grade (Excel column 2). */
    @ExcelProperty(index = 2)
    private String grade;
}
3、爬取所有需要遍历的网页地址,并写入 Excel 中
public class GetAllAreaInfo {
    public static void main(String[] args) throws IOException {
        getAllCity("xxxx");
    }

    public static void getAllCity(String baseUrl) throws IOException {
        //获取到整个页面对象
        document document = Jsoup.parse(new URL(baseUrl), 1000000);
        //根据标签id获取到标签元素所对应的整个元素对象
        Element citylist = document.getElementById("citylist");
        //获取到所有a标签
        Elements as = citylist.getElementsByTag("a");
        //将所有a标签的链接和值取出来
        List cityAreaList = new ArrayList<>();
        for (Element a : as) {
            String path = a.attr("href");
            String text = a.text();
            CityInfo cityArea = new CityInfo();
            cityArea.setCity(text);
            cityArea.setUrl("xxxx" + path);
            cityAreaList.add(cityArea);
            System.out.println(cityArea);
        }
        //将含有链接的对象传给getAllArea方法,获取到所有区域
        getAllArea(cityAreaList);
    }
    public static void getAllArea(List cityAreaList) throws IOException {
        int vaild=0;
        int invaild=0;
        List areaInfoList = new linkedList<>();
        for (CityInfo cityInfo:cityAreaList) {
            String cityUrl = cityInfo.getUrl();
            try{
                //获取到整个页面对象
                Connection connect = Jsoup.connect(cityUrl);
                document document = connect.get();
                //根据标签class获取到标签元素所对应的整个元素对象
                if(document.getElementsByClass("Districtlist").size()!=0&&document.getElementsByClass("Districtlist").size()!=3){
                    Elements districtlist = document.getElementsByClass("Districtlist");
                    //获取到所有a标签
                    Elements as = districtlist.first().getElementsByTag("a");
                    for (Element a : as) {
                        String href = a.attr("href");
                        String text = a.text();
                        System.out.println(href+" "+text);
                        //判断,不选取小学,初中和高中这个url地址
                        if(!(text.equals("小学")||text.equals("初中")||text.equals("高中"))){
                            //将区域的相关信息(url地址,城市,区域)存入到区域对象中
                            AreaInfo areaInfo = new AreaInfo();
                            areaInfo.setCity(cityInfo.getCity());
                            areaInfo.setAreaUrl("http://www.dzkbw.com" + href);
                            areaInfo.setArea(text);
                            //将区域url,相关信息获取写入list集合中
                            areaInfoList.add(areaInfo);
                        }
                    }
                }else{
                    //这个市下面没有分区域的电子教材
                    System.out.println(cityUrl+"--------------");
                    AreaInfo areaInfo = new AreaInfo();
                    areaInfo.setCity(cityInfo.getCity());
                    areaInfo.setAreaUrl(cityInfo.getUrl());
                    areaInfo.setArea(cityInfo.getCity());
                    //将区域url,相关信息获取写入list集合中
                    areaInfoList.add(areaInfo);
                }
                vaild++;
            }catch (Exception e){
                invaild++;
                e.printStackTrace();
            }
        }
        System.out.println("总共城市数:"+cityAreaList.size());
        System.out.println("页面可以访问的城市数:"+vaild);
        System.out.println("页面失效不可以访问的城市数:"+invaild);
        //将获取的区域信息写入excel中
        writeExcel(areaInfoList);
    }
    public static void writeExcel(List list){
        EasyExcel.write("D:\areaInfo.xlsx")
                .head(AreaInfo.class)
                .excelType(ExcelTypeEnum.XLSX)
                .sheet("area")
                .doWrite(list);
    }

    public static List readExcel(){
        final List list = new ArrayList<>();
        EasyExcel.read("D:\areaInfo.xlsx")
                .head(AreaInfo.class)
                .sheet()
                .registerReadListener(new AnalysisEventListener() {
                    @Override
                    public void invoke(AreaInfo areaInfo, AnalysisContext analysisContext) {
                        list.add(areaInfo);
                    }

                    @Override
                    public void doAfterAllAnalysed(AnalysisContext analysisContext) {
                        System.out.println("数据读取完毕");
                    }
                }).doRead();
        return list;
    }
}

4、将 Excel 中所有需要查询的网页地址存储在 LinkedBlockingQueue 中。多线程情况下,每个线程每次从队列头部取出一个网址,发送请求并解析页面获取数据;解析得到的对象存放在 CopyOnWriteArrayList 集合中,最后写入 Excel。
public class GetBookInfoByArea implements Runnable{

    //存放所有遍历出来的教材信息,存放再线程安全的list中
    private static List list = new CopyOnWriteArrayList<>();
    //使用阻塞队列存放所有区域的信息,每次都从这个队列获取区域地址,发送请求进行解析
    private static linkedBlockingQueue areaInfolinkedBlockingQueue = readExcel();

    @Override
    public void run() {
        //获取队列中的area对象,获取到url地址,然后发送http请求,获取到所需要的信息,然后存储在list中,最后写入excel中
        //获取到整个页面对象
        while(areaInfolinkedBlockingQueue.peek()!=null){
            //获取到区域对应的url地址
            AreaInfo areaInfo = areaInfolinkedBlockingQueue.poll();
            try{
                Connection connect = Jsoup.connect(areaInfo.getAreaUrl());
                document document = connect.get();
                getBookInfo(document,areaInfo);
                writeExcel(list);
            }catch (Exception e){
                e.printStackTrace();
            }
        }
    }

    //获取各个年级的教材信息,这块不好根据id获取标签,只能根据class获取,所以比较麻烦
    public static void getBookInfo(document document,AreaInfo areaInfo){
        //获取一年级和高二的相关信息
        if (document.getElementsByClass("i_d i_blue").size()!=0){
            Elements i_d_i_blue = document.getElementsByClass("i_d i_blue");
            for (Element e: i_d_i_blue) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取二年级和高三的相关信息
        if (document.getElementsByClass("i_d i_green").size()!=0) {
            Elements i_d_i_green = document.getElementsByClass("i_d i_green");
            for (Element e: i_d_i_green) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取三年级的相关信息
        if (document.getElementsByClass("i_d i_purple").size()!=0){
            Elements i_d_i_purple = document.getElementsByClass("i_d i_purple");
            for (Element e: i_d_i_purple) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取四年级的相关信息
        if (document.getElementsByClass("i_d i_red").size()!=0){
            Elements i_d_i_red = document.getElementsByClass("i_d i_red");
            for (Element e: i_d_i_red) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取五年级的相关信息
        if (document.getElementsByClass("i_d i_preen").size()!=0){
            Elements i_d_i_preen = document.getElementsByClass("i_d i_preen");
            for (Element e: i_d_i_preen) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取六年级的相关信息
        if (document.getElementsByClass("i_d i_blue2").size()!=0){
            Elements i_d_i_blue2 = document.getElementsByClass("i_d i_blue2");
            for (Element e: i_d_i_blue2) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取七年级的相关信息
        if (document.getElementsByClass("i_d i_green2").size()!=0){
            Elements i_d_i_green2 = document.getElementsByClass("i_d i_green2");
            for (Element e: i_d_i_green2) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取八年级的相关信息
        if (document.getElementsByClass("i_d i_purple2").size()!=0){
            Elements i_d_i_purple2 = document.getElementsByClass("i_d i_purple2");
            for (Element e: i_d_i_purple2) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取九年级的相关信息
        if (document.getElementsByClass("i_d i_red2").size()!=0){
            Elements i_d_i_red2 = document.getElementsByClass("i_d i_red2");
            for (Element e: i_d_i_red2) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
        //获取高一年级的相关信息
        if (document.getElementsByClass("i_d i_preen2").size()!=0){
            Elements i_d_i_preen2 = document.getElementsByClass("i_d i_preen2");
            for (Element e: i_d_i_preen2) {
                Element h3 = e.getElementsByTag("h3").first();
                Elements lis = e.getElementsByTag("li");
                for (Element li:lis){
                    //将相关教材信息写在BookInfo对象里
                    BookInfo bookInfo = new BookInfo();
                    bookInfo.setCity(areaInfo.getCity());
                    bookInfo.setArea(areaInfo.getArea());
                    bookInfo.setGrade(h3.text());
                    bookInfo.setSubject(li.getElementsByTag("A").get(1).text());
                    bookInfo.setVersion(li.getElementsByTag("i").text());
                    System.out.println(bookInfo.toString());
                    list.add(bookInfo);
                }
            }
        }
    }

    //返回一个线程安全的队列,队列里存储的是area的excel里的所有AreaInfo对象,便于多线程下每个线程从队列头部获取url地址。
    public static linkedBlockingQueue readExcel(){
        final linkedBlockingQueue areaInfolinkedBlockingQueue = new linkedBlockingQueue<>();
        EasyExcel.read("D:\可正常访问的城市区域信息.xlsx")
                .head(AreaInfo.class)
                .sheet()
                .registerReadListener(new AnalysisEventListener() {
                    @Override
                    public void invoke(AreaInfo areaInfo, AnalysisContext analysisContext) {
                        areaInfolinkedBlockingQueue.add(areaInfo);
                    }
                    @Override
                    public void doAfterAllAnalysed(AnalysisContext analysisContext) {
                        System.out.println("数据读取完毕");
                    }
                }).doRead();
        return areaInfolinkedBlockingQueue;
    }
    //将结果写入excel中
    public static void writeExcel(List list){
        EasyExcel.write("D:\各区教材信息.xlsx")
                .head(BookInfo.class)
                .excelType(ExcelTypeEnum.XLSX)
                .sheet("book")
                .doWrite(list);
    }
}
5、主方法
public class Main {
    public static void main(String[] args) {
        GetBookInfoByArea threads = new GetBookInfoByArea();
        //开启4个线程,相较于单线程,有效提升3倍速度。
        for(int i=0;i<4;i++){
            new Thread(threads).start();
        }
    }
}

以上代码是根据目标网站的 HTML 结构编写的,因此只适用于该网站;爬取其他网站时,需要按其页面结构调整解析逻辑。

转载请注明:文章转载自 www.051e.com
本文地址:http://www.051e.com/it/281832.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 ©2023-2025 051e.com

ICP备案号:京ICP备12030808号