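"""Bulk image crawler for Baidu image search.

Usage: python crawler.py $key_word $dest_folder $num_of_images $num_of_threads
"""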
import urllib.request
import urllib.parse
import re
import os
import sys
from multiprocessing.dummy import Pool

def download(download_info):
    """Fetch a single image URL and save it under the given file name."""
    (url, file_name) = download_info
    # Retry a few times: individual image hosts are often flaky.
    for _ in range(6):
        try:
            with urllib.request.urlopen(url, timeout=20) as response, \
                    open(file_name, 'wb') as out_file:
                out_file.write(response.read())
            return
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            pass
    print('Download failed: %s' % url)

def mass_download(urls, nthread):
    """Download all URLs in parallel with a pool of worker threads."""
    print('Downloading...')
    # Name each local file after the last path component of its URL.
    download_infos = [(url, os.path.basename(url)) for url in urls]
    with Pool(nthread) as pool:
        pool.map(download, download_infos)

def get_html(url_path):
    """Fetch a search-results page; return its text, or '' on failure."""
    print('Fetching html...')
    for _ in range(5):
        try:
            with urllib.request.urlopen(url_path, timeout=20) as response:
                # Decode the bytes instead of str()-ing them, which would
                # produce a "b'...'" literal full of escape sequences.
                return response.read().decode('utf-8', errors='ignore')
        except Exception:
            pass
    print('Fetching html failed...')
    return ''

def get_image_urls(html_content):
    """Extract the objURL image links embedded in the page source."""
    print('Parsing html...')
    exp = r'objURL":"([a-z.:/_A-Z0-9]*)"'
    image_urls = re.findall(exp, html_content)
    print('%d images found in this page' % len(image_urls))
    return image_urls

# Read command-line parameters.
if len(sys.argv) < 5:
    print('Usage: python crawler.py $key_word $dest_folder $num_of_images $num_of_threads')
    sys.exit()
# Percent-encode the keyword so non-ASCII search terms survive in the URL.
key_word = urllib.parse.quote(sys.argv[1])
dest_folder = sys.argv[2]
num_image = int(sys.argv[3])  # int(), not eval(): never eval() argv
nthread = int(sys.argv[4])

# Create the destination folder and work inside it.
if not os.path.exists(dest_folder):
    os.makedirs(dest_folder)
os.chdir(dest_folder)

pn = 0  # current result-page index
cnt = 0  # image URLs handed to the downloader so far
downloaded = set()  # URLs already seen, to skip duplicates across pages

while cnt < num_image:
    print("Page %d:" % (pn + 1))
    image_urls = []
    try:
        # Baidu's "flip" search endpoint pages through results via pn.
        url = ("http://images.baidu.com/search/flip?tn=baiduimage"
               "&ie=utf-8&word=%s&pn=%d&gsm=0" % (key_word, pn * 15))
        html_content = get_html(url)
        temp_urls = get_image_urls(html_content)
        for i in temp_urls:
            if i not in downloaded:
                downloaded.add(i)
                image_urls.append(i)
        mass_download(image_urls, nthread)
    except KeyboardInterrupt:
        sys.exit()
    except Exception:
        pass
    pn += 1
    cnt += len(image_urls)  # counts attempted URLs, not confirmed saves
print("Done.")
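
# Example invocation (keyword, folder, and counts below are illustrative):
#   python crawler.py cat cat_images 100 8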