Overview

While recently learning the basics of Java, I wrote a simple Java program that crawls a website and downloads its resources.

Planned functionality and features:

Source code

// written by SJTU_XHW
// file: MiniCrawler.java
package priv.testNet;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.*;

public class MiniCrawler {
    private String webURL;
    private URL url;

    // File extensions that are saved as-is; everything else is treated as an HTML page.
    private static final String support_regex = ".*\\.(js|css|xml|txt|mp4|mp3|avi|wmv|mpeg|mov|rmvb|flv|" +
            "jpeg|png|jpg|gif|ico|tif|tga|bmp|svg|eps|pdf|raw|psd)";
    // Matches protocol-relative links ("//host/path...") up to a closing quote or bracket.
    private static final Pattern URLPattern = Pattern.compile("(//.+?)[()><'\"]");
    // Characters that are not allowed in (Windows) file names.
    private static final Pattern NotAllowedName = Pattern.compile("[:/*?<>()\"\\\\]");

    public MiniCrawler(String spec) throws MalformedURLException {
        webURL = spec;
        url = new URL(webURL);
    }

    public URL getUrl() { return url; }

    public void setUrl(String spec) throws MalformedURLException {
        url = new URL(spec);
        webURL = spec;
    }

    private static void printEx(Exception ex, String src) {
        System.out.println("[ERROR] " + ex.getMessage());
        System.out.printf("[ERROR] Failed to load %s%n", src);
    }

    // Collects every link matched by URLPattern in the given HTML file.
    private static void scanSubURL(ArrayList<String> urls, File htmlFile) throws IOException {
        try (Scanner initScanner = new Scanner(htmlFile)) {
            while (initScanner.hasNextLine()) {
                String pageLine = initScanner.nextLine();
                Matcher matcher = URLPattern.matcher(pageLine);
                // group(1) is the link itself, without the trailing delimiter.
                while (matcher.find()) { urls.add(matcher.group(1)); }
            }
        }
    }

    // Prints the page at the current URL to stdout.
    public void readHTML() {
        int count = 0;
        System.out.printf("[INFO] Crawling: %s%n", url);
        try (Scanner input = new Scanner(url.openStream())) {
            while (input.hasNextLine()) {
                String pageLine = input.nextLine();
                count += pageLine.length();
                System.out.println(pageLine);
            }
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url));
        }
        System.out.printf("[INFO] Finished crawling, %d characters read.%n", count);
    }

    // Copies the resource at url_ into file; returns the number of bytes written.
    public static long downloadResource(URL url_, File file) throws IOException {
        if (url_.toString().contains("api")) return 0;   // skip URLs that look like API endpoints
        // Make sure the target directory exists before opening the output stream.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) parent.mkdirs();
        long count = 0;
        int tmp;
        try (BufferedInputStream input = new BufferedInputStream(url_.openStream());
             BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(file))) {
            while ((tmp = input.read()) != -1) {
                output.write(tmp);
                ++count;
            }
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url_));
        }
        return count;
    }

    // Breadth-first download starting from webURL; scans at most maxNum HTML pages.
    public long scanWebsite(int maxNum, File saveDir) throws IOException {
        int count = 0;
        long written_bytes = 0;
        ArrayList<String> urls = new ArrayList<>(maxNum);
        urls.add(webURL);
        while (!urls.isEmpty() && count < maxNum) {
            String curURL = urls.remove(0);
            if (!curURL.startsWith("http")) curURL = "http:" + curURL;

            // Turn the URL into a legal file name.
            Matcher nameMatcher = NotAllowedName.matcher(curURL);
            String fileName = nameMatcher.replaceAll("-");
            if (!fileName.matches(support_regex)) fileName += ".html";

            // Download the resource.
            File curWebFile = new File(saveDir, fileName);
            try {
                System.out.printf("[INFO] Downloading: %s%n", curURL);
                written_bytes += downloadResource(new URL(curURL), curWebFile);
            } catch (MalformedURLException mex) {
                printEx(mex, String.format("file: %s", fileName));
            }
            if (!fileName.endsWith(".html")) { continue; }

            // Scan the downloaded page for further links.
            System.out.printf("[INFO] Scanning: %s%n", curURL);
            try {
                scanSubURL(urls, curWebFile);
            } catch (IOException ex) {
                printEx(ex, String.format("URL: %s", curURL));
            }
            ++count;
        }
        System.out.printf("[INFO] Finished. Total: %d pages scanned, %d bytes written.%n", count, written_bytes);
        return written_bytes;
    }
}
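
A quick illustration of the two regular expressions that drive the crawler: URLPattern only picks up protocol-relative links of the form //host/path and stops at the first quote or bracket, and NotAllowedName then rewrites such a link into a legal Windows file name. The sketch below is a hypothetical standalone class (RegexDemo, fed a made-up line of HTML) that is not part of the project; it simply runs the same two patterns in isolation:

// file: RegexDemo.java (hypothetical demo, not part of the project)
package priv.testNet;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
    // The same two patterns that MiniCrawler uses.
    private static final Pattern URL_PATTERN = Pattern.compile("(//.+?)[()><'\"]");
    private static final Pattern NOT_ALLOWED_NAME = Pattern.compile("[:/*?<>()\"\\\\]");

    public static void main(String[] args) {
        // A made-up line of HTML, similar to what scanSubURL() reads.
        String pageLine = "<img src=\"//img.example.com/pic/001.png\"><script src='//cdn.example.com/a.js'>";

        Matcher m = URL_PATTERN.matcher(pageLine);
        while (m.find()) {
            // group(1) is the protocol-relative link without the trailing delimiter.
            String link = m.group(1);
            // Rewrite illegal path characters, as scanWebsite() does before saving.
            String fileName = NOT_ALLOWED_NAME.matcher(link).replaceAll("-");
            System.out.printf("link: %s  ->  file: %s%n", link, fileName);
        }
        // Expected output:
        //   link: //img.example.com/pic/001.png  ->  file: --img.example.com-pic-001.png
        //   link: //cdn.example.com/a.js  ->  file: --cdn.example.com-a.js
    }
}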
// file: testCrawler.java
// written by SJTU_XHW
package priv.testNet;

import java.io.File;
import java.io.IOException;
import java.util.Scanner;

public class testCrawler {
    public static void main(String[] args) throws IOException {
        MiniCrawler crawler = new MiniCrawler("http://127.0.0.1");
        Scanner scanner = new Scanner(System.in);
        File saveDir = new File(".");
        while (true) {
            System.out.print("Please type in the saving directory " +
                    "(ENTER = keep the last dir (initially '.'), 'Q' = quit, 'T' = run the built-in test): ");
            if (!scanner.hasNextLine()) break;
            String dir = scanner.nextLine().trim();
            if (dir.equals("Q")) break;
            if (dir.equals("T")) { test(); continue; }
            if (!dir.isEmpty()) saveDir = new File(dir);

            System.out.print("Please type in the website's URL: ");
            crawler.setUrl(scanner.nextLine().trim());

            System.out.print("Please type in the maximum number of pages to scan: ");
            crawler.scanWebsite(Integer.parseInt(scanner.nextLine().trim()), saveDir);
        }
        scanner.close();
    }

    public static void test() throws IOException {
        MiniCrawler crawler = new MiniCrawler("https://www.baidu.com");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        File saveDir = new File("test");
        if (!saveDir.exists()) saveDir.mkdir();

        File saveFile = new File(saveDir, "baidu.html");
        singlePage(crawler, saveFile);

        crawler.setUrl("https://www.vcg.com/creative-image/fengjing/");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        crawler.scanWebsite(100, saveDir);
    }

    public static void singlePage(MiniCrawler crawler, File saveFile) {
        try {
            MiniCrawler.downloadResource(crawler.getUrl(), saveFile);
        } catch (IOException ex) {
            System.out.println("[ERROR] " + ex.getMessage());
            System.out.println("[ERROR] Failed to create the file.");
        }
    }
}

The program above has been packaged both as a runnable jar: 🔗testCrawler.jar, and as a Windows executable (exe): 🔗testCrawler.exe.

On Windows, simply double-click the exe to try it out.

On other platforms, download a JRE (version ≥ 1.7), set up the Java environment variables, and then run: java -jar testCrawler.jar

Closing remarks

I am still a beginner, so if you spot technical problems or bugs, feel free to reach out!

[WARNING] This sample will most likely stop being maintained ⚠

To my surprise, the code above is still nowhere near as powerful as wget. On a system with wget installed, you can simply run:

wget -c -r -np -k -L -p <URL>

One line takes care of the whole crawl (-c resumes interrupted downloads, -r recurses, -np never ascends above the starting directory, -k rewrites links for local browsing, -L follows relative links only, and -p also fetches page requisites such as images and CSS). GNU software really is excellent…