Python 是進行網頁爬蟲和網頁數據抓取的一門不錯的語言,並且提供了不少用於數據抓取的模塊。urllib 是用於打開網頁連結的模塊,其中的 urlopen() 函數用於打開網頁;bs4(BeautifulSoup 模塊)則用 BeautifulSoup() 函數解析返回的 html 數據。
工具/原料
python3.4
BeautifulSoup
方法/步驟
from urllib.request import urlopen
用於打開網頁
from urllib.error import HTTPError
用於處理連結異常
from bs4 import BeautifulSoup
用於處理html文檔
import re
用正則表達式匹配目標字符串
例子:抓取百度新聞網頁中的某些圖片連結
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
# Fetch the Baidu News front page and print the ``src`` of every <img>
# whose URL contains "http://news.baidu.com".
url = "http://news.baidu.com/"

try:
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(url) as response:
        html = response.read()
except HTTPError as e:
    print(e)
    # Without a page there is nothing to parse; stop here instead of
    # falling through and failing later on an unbound name.
    raise SystemExit(1)

# Name the parser explicitly so the result does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
bsObj = BeautifulSoup(html, "html.parser")

# Beautiful Soup applies a compiled pattern with search(), so a trailing
# ".*" is redundant; the dots are escaped so they match literal dots only.
image_src_pattern = re.compile(r"http://news\.baidu\.com")
images = bsObj.find_all("img", {"src": image_src_pattern})
for image in images:
    print(image["src"])
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
public class Capture {
/**
 * Downloads the Baidu News front page and prints the text found between
 * its {@code <title>} and {@code </title>} tags, prefixed with
 * "IP address:".
 *
 * @param args command-line arguments (unused)
 * @throws MalformedURLException if the hard-coded URL cannot be parsed
 * @throws IOException if the page cannot be fetched or read
 */
public static void main(String[] args) throws MalformedURLException, IOException {
    String strUrl = "http://news.baidu.com/";
    URL url = new URL(strUrl);
    HttpURLConnection httpConnection = (HttpURLConnection) url.openConnection();
    StringBuilder stringBuilder = new StringBuilder();
    // try-with-resources closes the reader (and the underlying stream) on
    // every path; the finally clause releases the connection itself.
    try (BufferedReader bufferedReader = new BufferedReader(
            new InputStreamReader(httpConnection.getInputStream(), "utf-8"))) {
        String line;
        // readLine() strips line terminators, so the page is accumulated
        // as one continuous string.
        while ((line = bufferedReader.readLine()) != null) {
            stringBuilder.append(line);
        }
    } finally {
        httpConnection.disconnect();
    }
    String string = stringBuilder.toString();
    String openTag = "<title>";
    int begin = string.indexOf(openTag);
    // Look for the closing tag only after the opening one so that
    // begin <= end always holds when both are found.
    int end = begin < 0 ? -1 : string.indexOf("</title>", begin + openTag.length());
    if (begin < 0 || end < 0) {
        // indexOf returned -1: substring() would throw, so report and stop.
        System.err.println("No <title> element found in " + strUrl);
        return;
    }
    // Skip past the opening tag so only the title text is printed.
    System.out.println("IP address:" + string.substring(begin + openTag.length(), end));
}