Overview

While recently learning the basics of Java, I wrote a simple Java program that crawls a website and downloads its resources.

Planned functionality and features:

Source code

// written by SJTU_XHW
// file: MiniCrawler.java
package priv.testNet;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.*;

public class MiniCrawler {
    private String webURL;
    private URL url;

    // File extensions that are saved as-is; everything else is treated as an HTML page.
    private static final String support_regex = ".*\\.(js|css|xml|txt|mp4|mp3|avi|wmv|mpeg|mov|rmvb|flv|" +
            "jpeg|png|jpg|gif|ico|tif|tga|bmp|svg|eps|pdf|raw|psd)";
    // Matches protocol-relative links ("//host/path...") up to a closing quote or bracket.
    private static final Pattern URLPattern = Pattern.compile("(//.+?)[()><'\"]");
    // Characters that are not allowed in (Windows) file names.
    private static final Pattern NotAllowedName = Pattern.compile("[:/*?<>()\"\\\\]");

    public MiniCrawler(String spec) throws MalformedURLException {
        webURL = spec;
        url = new URL(webURL);
    }

    public URL getUrl() { return url; }

    public void setUrl(String spec) throws MalformedURLException {
        url = new URL(spec);
        webURL = spec;
    }

    private static void printEx(Exception ex, String src) {
        System.out.println("[ERROR] " + ex.getMessage());
        System.out.printf("[ERROR] Failed to load %s%n", src);
    }

    // Collects every link matched by URLPattern in the given HTML file.
    private static void scanSubURL(ArrayList<String> urls, File htmlFile) throws IOException {
        try (Scanner initScanner = new Scanner(htmlFile)) {
            while (initScanner.hasNextLine()) {
                String pageLine = initScanner.nextLine();
                Matcher matcher = URLPattern.matcher(pageLine);
                // group(1) is the link itself, without the trailing delimiter.
                while (matcher.find()) { urls.add(matcher.group(1)); }
            }
        }
    }

    // Prints the page at the current URL to stdout.
    public void readHTML() {
        int count = 0;
        System.out.printf("[INFO] Crawling: %s%n", url);
        try (Scanner input = new Scanner(url.openStream())) {
            while (input.hasNextLine()) {
                String pageLine = input.nextLine();
                count += pageLine.length();
                System.out.println(pageLine);
            }
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url));
        }
        System.out.printf("[INFO] Finished crawling, %d characters read.%n", count);
    }

    // Copies the resource at url_ into file; returns the number of bytes written.
    public static long downloadResource(URL url_, File file) throws IOException {
        if (url_.toString().contains("api")) return 0;   // skip URLs that look like API endpoints
        // Make sure the target directory exists before opening the output stream.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) parent.mkdirs();
        long count = 0;
        int tmp;
        try (BufferedInputStream input = new BufferedInputStream(url_.openStream());
             BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(file))) {
            while ((tmp = input.read()) != -1) {
                output.write(tmp);
                ++count;
            }
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url_));
        }
        return count;
    }

    // Breadth-first download starting from webURL; scans at most maxNum HTML pages.
    public long scanWebsite(int maxNum, File saveDir) throws IOException {
        int count = 0;
        long written_bytes = 0;
        ArrayList<String> urls = new ArrayList<>(maxNum);
        urls.add(webURL);
        while (!urls.isEmpty() && count < maxNum) {
            String curURL = urls.remove(0);
            if (!curURL.startsWith("http")) curURL = "http:" + curURL;

            // Turn the URL into a legal file name.
            Matcher nameMatcher = NotAllowedName.matcher(curURL);
            String fileName = nameMatcher.replaceAll("-");
            if (!fileName.matches(support_regex)) fileName += ".html";

            // Download the resource.
            File curWebFile = new File(saveDir, fileName);
            try {
                System.out.printf("[INFO] Downloading: %s%n", curURL);
                written_bytes += downloadResource(new URL(curURL), curWebFile);
            } catch (MalformedURLException mex) {
                printEx(mex, String.format("file: %s", fileName));
            }
            if (!fileName.endsWith(".html")) { continue; }

            // Scan the downloaded page for further links.
            System.out.printf("[INFO] Scanning: %s%n", curURL);
            try {
                scanSubURL(urls, curWebFile);
            } catch (IOException ex) {
                printEx(ex, String.format("URL: %s", curURL));
            }
            ++count;
        }
        System.out.printf("[INFO] Finished. Total: %d pages scanned, %d bytes written.%n", count, written_bytes);
        return written_bytes;
    }
}
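
A quick illustration of the two regular expressions that drive the crawler: URLPattern only picks up protocol-relative links of the form //host/path and stops at the first quote or bracket, and NotAllowedName then rewrites such a link into a legal Windows file name. The sketch below is a hypothetical standalone class (RegexDemo, fed a made-up line of HTML) that is not part of the project; it simply runs the same two patterns in isolation:

// file: RegexDemo.java (hypothetical demo, not part of the project)
package priv.testNet;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
    // The same two patterns that MiniCrawler uses.
    private static final Pattern URL_PATTERN = Pattern.compile("(//.+?)[()><'\"]");
    private static final Pattern NOT_ALLOWED_NAME = Pattern.compile("[:/*?<>()\"\\\\]");

    public static void main(String[] args) {
        // A made-up line of HTML, similar to what scanSubURL() reads.
        String pageLine = "<img src=\"//img.example.com/pic/001.png\"><script src='//cdn.example.com/a.js'>";

        Matcher m = URL_PATTERN.matcher(pageLine);
        while (m.find()) {
            // group(1) is the protocol-relative link without the trailing delimiter.
            String link = m.group(1);
            // Rewrite illegal path characters, as scanWebsite() does before saving.
            String fileName = NOT_ALLOWED_NAME.matcher(link).replaceAll("-");
            System.out.printf("link: %s  ->  file: %s%n", link, fileName);
        }
        // Expected output:
        //   link: //img.example.com/pic/001.png  ->  file: --img.example.com-pic-001.png
        //   link: //cdn.example.com/a.js  ->  file: --cdn.example.com-a.js
    }
}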
// file: testCrawler.java
// written by SJTU_XHW
package priv.testNet;

import java.io.File;
import java.io.IOException;
import java.util.Scanner;

public class testCrawler {
    public static void main(String[] args) throws IOException {
        MiniCrawler crawler = new MiniCrawler("http://127.0.0.1");
        Scanner scanner = new Scanner(System.in);
        File saveDir = new File(".");
        while (true) {
            System.out.print("Please type in the saving directory " +
                    "(ENTER = keep the last dir (initially '.'), 'Q' = quit, 'T' = run the built-in test): ");
            if (!scanner.hasNextLine()) break;
            String dir = scanner.nextLine().trim();
            if (dir.equals("Q")) break;
            if (dir.equals("T")) { test(); continue; }
            if (!dir.isEmpty()) saveDir = new File(dir);

            System.out.print("Please type in the website's URL: ");
            crawler.setUrl(scanner.nextLine().trim());

            System.out.print("Please type in the maximum number of pages to scan: ");
            crawler.scanWebsite(Integer.parseInt(scanner.nextLine().trim()), saveDir);
        }
        scanner.close();
    }

    public static void test() throws IOException {
        MiniCrawler crawler = new MiniCrawler("https://www.baidu.com");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        File saveDir = new File("test");
        if (!saveDir.exists()) saveDir.mkdir();

        File saveFile = new File(saveDir, "baidu.html");
        singlePage(crawler, saveFile);

        crawler.setUrl("https://www.vcg.com/creative-image/fengjing/");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        crawler.scanWebsite(100, saveDir);
    }

    public static void singlePage(MiniCrawler crawler, File saveFile) {
        try {
            MiniCrawler.downloadResource(crawler.getUrl(), saveFile);
        } catch (IOException ex) {
            System.out.println("[ERROR] " + ex.getMessage());
            System.out.println("[ERROR] Failed to create the file.");
        }
    }
}

The program above has been packaged both as a runnable jar: 🔗testCrawler.jar, and as a Windows executable (exe): 🔗testCrawler.exe.

On Windows, simply double-click the exe to try it out.

On other platforms, download a JRE (version ≥ 1.7), set up the Java environment variables, and then run: java -jar testCrawler.jar

Closing remarks

I am still a beginner, so if you spot technical problems or bugs, feel free to reach out!

[WARNING] This sample will most likely stop being maintained ⚠

To my surprise, the code above is still nowhere near as powerful as wget. On a system with wget installed, you can simply run:

wget -c -r -np -k -L -p <URL>

One line takes care of the whole crawl (-c resumes interrupted downloads, -r recurses, -np never ascends above the starting directory, -k rewrites links for local browsing, -L follows relative links only, and -p also fetches page requisites such as images and CSS). GNU software really is excellent…