Overview: While recently learning the basics of Java, I put together a simple Java program that crawls a website and downloads its resources.
Planned functionality and features: fetch a starting page, extract the URLs it references, download the linked resources (HTML pages, images, media, scripts, stylesheets) into a local directory, and keep scanning newly discovered pages up to a user-specified limit.
Source code (MiniCrawler.java):

package priv.testNet;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.*;

public class MiniCrawler {

    private String webURL;
    private URL url;

    // File extensions that are saved as-is; anything else is treated as an HTML page.
    private static final String support_regex =
            ".*\\.(js|css|xml|txt|mp4|mp3|avi|wmv|mpeg|mov|rmvb|flv|"
            + "jpeg|png|jpg|gif|ico|tif|tga|bmp|svg|eps|pdf|raw|psd)";
    // Captures a protocol-relative URL ("//host/path...") up to the next delimiter character.
    private static final Pattern URLPattern = Pattern.compile("(//.+?)[()><'\"]");
    // Characters that are illegal in file names; they get replaced by '-'.
    private static final Pattern NotAllowedName = Pattern.compile("[:/*?<>()\"\\\\]");

    public MiniCrawler(String spec) throws MalformedURLException {
        webURL = spec;
        url = new URL(webURL);
    }

    public URL getUrl() {
        return url;
    }

    public void setUrl(String spec) throws MalformedURLException {
        url = new URL(spec);
        webURL = spec;
    }

    private static void printEx(Exception ex, String src) {
        System.out.println("[ERROR] " + ex.getMessage());
        System.out.printf("[ERROR] Failed to load %s\n", src);
    }

    // Scans a downloaded HTML file and collects every URL referenced in it.
    private static void scanSubURL(ArrayList<String> urls, File htmlFile) throws IOException {
        try (Scanner initScanner = new Scanner(htmlFile)) {
            while (initScanner.hasNext()) {
                String pageLine = initScanner.nextLine();
                Matcher matcher = URLPattern.matcher(pageLine);
                while (matcher.find()) {
                    urls.add(matcher.group(1)); // group(1) excludes the trailing delimiter
                }
            }
        }
    }

    // Prints the page behind the current URL to stdout.
    public void readHTML() {
        int count = 0;
        System.out.printf("[INFO] Crawling: %s\n", url);
        try (Scanner input = new Scanner(url.openStream())) {
            while (input.hasNext()) {
                String pageLine = input.nextLine();
                count += pageLine.length();
                System.out.println(pageLine);
            }
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url));
        }
        System.out.printf("[INFO] Finished crawling, %d characters\n", count);
    }

    // Downloads one resource into the given file and returns the number of bytes written.
    public static long downloadResource(URL url_, File file) throws IOException {
        if (url_.toString().contains("api")) return 0; // skip URLs that contain "api"
        long count = 0;
        int tmp;
        try (BufferedInputStream input = new BufferedInputStream(url_.openStream());
             BufferedOutputStream output = new BufferedOutputStream(new FileOutputStream(file))) {
            while ((tmp = input.read()) != -1) {
                output.write(tmp);
                ++count;
            }
        } catch (FileNotFoundException file_ex) {
            // Also thrown for HTTP 404, not only for unusable local paths; report and skip.
            printEx(file_ex, String.format("URL: %s", url_));
        } catch (IOException io_ex) {
            printEx(io_ex, String.format("URL: %s", url_));
        }
        return count;
    }

    // Simple FIFO crawl: download each queued URL, and for HTML pages keep collecting
    // new URLs until the queue is empty or maxNum pages have been scanned.
    public long scanWebsite(int maxNum, File saveDir) throws IOException {
        int count = 0;
        long written_bytes = 0;
        ArrayList<String> urls = new ArrayList<>(maxNum);
        urls.add(webURL);
        while (!urls.isEmpty() && count <= maxNum) {
            String curURL = urls.remove(0);
            if (!curURL.startsWith("http")) curURL = "http:" + curURL;
            // Derive a legal local file name from the URL.
            Matcher nameMatcher = NotAllowedName.matcher(curURL);
            String fileName = nameMatcher.replaceAll("-");
            if (!fileName.matches(support_regex)) fileName += ".html";
            File curWebFile = new File(saveDir, fileName);
            try {
                System.out.printf("[INFO] Downloading: %s\n", curURL);
                written_bytes += downloadResource(new URL(curURL), curWebFile);
            } catch (MalformedURLException mex) {
                printEx(mex, String.format("file: %s", fileName));
            }
            if (!fileName.endsWith(".html")) {
                continue; // only HTML pages are scanned for further links
            }
            System.out.printf("[INFO] Scanning: %s\n", curURL);
            try {
                scanSubURL(urls, curWebFile);
            } catch (IOException ex) {
                printEx(ex, String.format("URL: %s", curURL));
            }
            ++count;
        }
        System.out.printf("[INFO] Finished. Total: %d websites, %d bytes.\n", count, written_bytes);
        return written_bytes;
    }
}
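The link extraction rests entirely on URLPattern, which grabs any protocol-relative fragment ("//host/path...") lazily up to the next quote, bracket, or angle-bracket character. A minimal standalone sketch of how one line of HTML is matched; the sample markup and the example.com host are made up purely for illustration:

import java.util.regex.*;

public class URLPatternDemo {
    public static void main(String[] args) {
        // Same pattern as MiniCrawler.URLPattern: capture "//..." lazily until a delimiter.
        Pattern urlPattern = Pattern.compile("(//.+?)[()><'\"]");
        // A made-up HTML line for illustration only.
        String pageLine = "<img src=\"//example.com/img/logo.png\"><a href='//example.com/page'>";
        Matcher m = urlPattern.matcher(pageLine);
        while (m.find()) {
            // group(1) is the URL itself; group() would also include the closing delimiter.
            System.out.println(m.group(1));
        }
        // Prints:
        // //example.com/img/logo.png
        // //example.com/page
    }
}

scanWebsite() later prefixes such fragments with "http:" before downloading them. The test driver below exercises the class interactively.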
Test driver (testCrawler.java):

package priv.testNet;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Scanner;

public class testCrawler {

    public static void main(String[] args) throws MalformedURLException, IOException {
        MiniCrawler crawler = new MiniCrawler("http://127.0.0.1");
        Scanner scanner = new Scanner(System.in);
        File saveDir = new File(".");
        while (true) {
            System.out.print("Please type in the saving directory "
                    + "(ENTER = last dir (initially '.'), quit = 'Q', test = 'T'): ");
            if (!scanner.hasNextLine()) break;
            String dir = scanner.nextLine().trim();
            if (dir.equals("Q")) break;
            if (dir.equals("T")) {
                test();
                continue;
            }
            if (!dir.isEmpty()) saveDir = new File(dir); // plain ENTER keeps the previous directory
            System.out.print("Please type in the website's URL: ");
            crawler.setUrl(scanner.nextLine().trim());
            System.out.print("Please type in the maximum number of pages to scan: ");
            crawler.scanWebsite(Integer.parseInt(scanner.nextLine().trim()), saveDir);
        }
    }

    // Built-in smoke test: download one page, then crawl a site with a page limit.
    public static void test() throws IOException {
        MiniCrawler crawler = new MiniCrawler("https://www.baidu.com");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        File saveDir = new File("test"); // output directory under the working directory
        if (!saveDir.exists()) saveDir.mkdir();
        File saveFile = new File(saveDir, "baidu.html");
        singlePage(crawler, saveFile);
        crawler.setUrl("https://www.vcg.com/creative-image/fengjing/");
        System.out.printf("[INFO] init URL: %s%n", crawler.getUrl());
        crawler.scanWebsite(100, saveDir);
    }

    // Downloads a single page into the given file.
    public static void singlePage(MiniCrawler crawler, File saveFile) {
        try {
            MiniCrawler.downloadResource(crawler.getUrl(), saveFile);
        } catch (IOException ex) {
            System.out.println("[ERROR] " + ex.getMessage());
            System.out.println("[ERROR] Failed to create the file.");
        }
    }
}
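If you prefer to build from source rather than use the prebuilt packages below, a minimal sketch (assuming the two files above are saved as priv/testNet/MiniCrawler.java and priv/testNet/testCrawler.java to match the package declaration, and using out as the class output directory):

javac -d out priv/testNet/MiniCrawler.java priv/testNet/testCrawler.java
java -cp out priv.testNet.testCrawler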
The program above has been packaged both as a runnable jar: 🔗testCrawler.jar, and as a Windows executable (exe): 🔗testCrawler.exe.
On Windows you can simply double-click the exe to try it out.
On other platforms, install a JRE (version ≥ 1.7), set up the Java environment variables, and then run: java -jar testCrawler.jar
Closing words: I'm still a beginner at this; if you spot technical problems or bugs, feel free to reach out!
[WARNING] This sample will most likely not be maintained further ⚠
What I never expected is that the code above is nowhere near as powerful as wget. On any system with wget installed, you can simply run:
wget -c -r -np -k -L -p <URL>
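For reference, the flags are: -c resumes partial downloads, -r recurses into links, -np never ascends to the parent directory, -k converts links so the saved copy can be browsed offline, -L follows relative links only, and -p also fetches the images and CSS each page needs.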
One line handles the whole crawl! I can only say that GNU software really is excellent…