少年游

欲买桂花同载酒,终不似,少年游。

0%

IJCAI论文爬虫

爬虫

最简易的爬虫

1
2
3
4
5
6
# requests.get(<link>) issues an HTTP GET request for the given URL.
# status_code == 200 means the request succeeded.
r = requests.get(root_link)
if r.status_code == 200:
    # Parse the HTML body. Passing an explicit parser avoids the
    # "no parser was explicitly specified" warning and makes the
    # result independent of which parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

保存为pdf文档

1
2
3
4
5
6
7
8
9
10
11
12
def download_file(url, index):
    """Download the PDF at *url* into ./ijcai-papers/ and return the local path.

    `index` is a string suffix (here: the cleaned-up paper title) appended to
    the base file name so that downloads do not overwrite each other.
    """
    # Working-directory-relative target path, built from the last URL
    # segment with its extension stripped (e.g. ".../001.pdf" -> "001").
    local_filename = './ijcai-papers/' + url.split('/')[-1].split('.')[0] + index + '.pdf'
    # NOTE the stream=True parameter: the PDF is fetched in chunks instead
    # of being loaded into memory in one piece.
    r = requests.get(url, stream=True)
    try:
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
            # No per-chunk flush: closing the file (via `with`) flushes once.
    finally:
        # Release the underlying HTTP connection even if writing fails.
        r.close()
    return local_filename

源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#-*- coding: UTF-8 -*-
__author__ = 'Paul'

# email : chxu@m.scnu.edu.cn
# wechat : WX24315548
import string
import requests
import urllib2
from bs4 import BeautifulSoup
import xlwt

def save_excel(names):
    """Write the paper titles in *names* to index.xls, one title per row."""
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet1 = book.add_sheet('目录', cell_overwrite_ok=True)
    for row, name in enumerate(names):
        print(name)  # progress feedback while building the table of contents
        # The workbook was created with encoding="utf-8", so the UTF-8
        # encoded byte string is written as-is (Python 2 semantics).
        sheet1.write(row, 0, name.encode('utf-8'))
    book.save('index.xls')


def get_Paper_Names(soup):
    """Return the list of paper titles found on the proceedings page.

    The first 7 <p> elements are page boilerplate and are skipped; each
    following <p> holds text of the form "Title / pages", so the portion
    before the first '/' is the title.

    NOTE(review): the skip count of 7 is tied to the 2016 page layout —
    verify it if the proceedings URL changes.
    """
    return [p.getText().split('/')[0] for p in soup.find_all('p')[7:]]

def download_file(url, index):
    """Download the PDF at *url* into ./ijcai-papers2/ and return the local path.

    `index` is a string suffix (the cleaned-up paper title) appended to the
    base file name so that downloads do not overwrite each other.
    """
    # Target path from the last URL segment with its extension stripped
    # (e.g. ".../001.pdf" -> "001").
    local_filename = './ijcai-papers2/' + url.split('/')[-1].split('.')[0] + index + '.pdf'
    # NOTE the stream=True parameter: fetch the PDF in chunks rather than
    # buffering the whole body in memory.
    r = requests.get(url, stream=True)
    try:
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
            # No per-chunk flush: the `with` block flushes once on close.
    finally:
        # Always release the HTTP connection, even on a write error.
        r.close()
    return local_filename

# Example paper URL: http://ijcai.org/Proceedings/16/Papers/001.pdf
root_link = "http://ijcai.org/Proceedings/2016"
r = requests.get(root_link)
if r.status_code == 200:
    soup = BeautifulSoup(r.text)
    # print(soup.prettify())

    # Collect the paper titles and save them as an Excel table of contents.
    name = get_Paper_Names(soup)
    save_excel(name)

    # `index` is the 1-based number of the next paper to download; raise its
    # initial value (e.g. index = 22) to resume an interrupted run.
    index = 1
    need_link = "http://ijcai.org/"
    # `exp` counts PDF links seen so far, so links before `index` are skipped.
    exp = 1
    for link in soup.find_all('a'):
        # Anchors without an href attribute yield None — skip them instead
        # of crashing on string concatenation.
        href = link.get('href')
        if href is None:
            continue
        new_link = need_link + href
        if new_link.endswith(".pdf"):
            if exp < index:
                print(link)
                exp += 1
                continue
            print(new_link)
            title = name[index - 1].encode("utf-8")
            # Replace characters that are illegal or troublesome in file names.
            fixTitle = title.replace("”","_").replace("“","_").replace("∃-","_").replace("ℓ1","_").replace("’","").replace(':','_').replace('?','_').replace("—","_").replace("+(∇, ⊓)-","_")
            print(fixTitle)
            file_path = download_file(new_link, fixTitle)
            print("downloading:" + new_link + " -> " + file_path)
            index += 1
            exp = index
    print("all download finished")
else:
    print("errors occur.")