# -*- coding: utf-8 -*-
__author__ = 'Paul'
import os

import requests
from bs4 import BeautifulSoup
import xlwt
def save_excel(names):
    # Write every paper title into column 0 of a new .xls workbook.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet1 = book.add_sheet('目录', cell_overwrite_ok=True)  # sheet name means "contents"
    for index, name in enumerate(names):
        print name
        sheet1.write(index, 0, name)  # xlwt accepts unicode directly
    book.save('index.xls')
def get_Paper_Names(soup):
    # The first seven <p> tags on the proceedings page are front matter;
    # each later <p> reads "<title> / <authors>", so keep the part before '/'.
    names = []
    for i, p in enumerate(soup.find_all('p')):
        if i < 7:
            continue
        names.append(p.getText().split('/')[0])
    return names
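# An aside, not in the original script: skipping a fixed seven <p> tags is
# fragile if the page layout ever changes. An alternative sketch keeps only
# paragraphs that actually look like "title / authors" entries (the '/'
# marker is an assumption about the page format, not taken from the source):
def get_paper_names_by_marker(soup):
    return [p.getText().split('/')[0]
            for p in soup.find_all('p') if '/' in p.getText()]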
def download_file(url, title):
    # Save the PDF as "<paper id><title>.pdf" under ./ijcai-papers2/,
    # creating the directory on first use.
    if not os.path.isdir('./ijcai-papers2'):
        os.makedirs('./ijcai-papers2')
    local_filename = './ijcai-papers2/' + url.split('/')[-1].split('.')[0] + title + '.pdf'
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename
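# An aside, not part of the original script: download_file above has no
# timeout or error handling, so one stalled server connection hangs the whole
# crawl. A minimal hardened sketch (the 30-second timeout and three retries
# are arbitrary assumptions, not values from the source):
import time

def download_file_safe(url, title, retries=3):
    # Same naming scheme as download_file, plus a timeout and simple retries.
    local_filename = './ijcai-papers2/' + url.split('/')[-1].split('.')[0] + title + '.pdf'
    for attempt in range(retries):
        try:
            r = requests.get(url, stream=True, timeout=30)
            r.raise_for_status()  # treat 4xx/5xx responses as failures worth retrying
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return local_filename
        except requests.RequestException as e:
            print "attempt %d failed for %s: %s" % (attempt + 1, url, e)
            time.sleep(2)  # brief pause before retrying
    return None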
root_link = "http://ijcai.org/Proceedings/2016"
need_link = "http://ijcai.org/"

r = requests.get(root_link)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    names = get_Paper_Names(soup)
    save_excel(names)
    index = 1  # 1-based position of the current PDF link, pairs it with its title
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:  # some anchors carry no href at all
            continue
        new_link = need_link + href
        if not new_link.endswith(".pdf"):
            continue
        print new_link
        title = names[index - 1].encode("utf-8")
        # Strip characters that are awkward or illegal in file names.
        fix_title = (title.replace("”", "_").replace("“", "_")
                     .replace("∃-", "_").replace("ℓ1", "_").replace("’", "")
                     .replace(':', '_').replace('?', '_').replace("—", "_")
                     .replace("+(∇, ⊓)-", "_"))
        print fix_title
        file_path = download_file(new_link, fix_title)
        print "downloading: " + new_link + " -> " + file_path
        index += 1
    print "all downloads finished"
else:
    print "errors occur."
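# An aside, not in the original: the replace() chain above only strips the
# specific characters that appear in the 2016 titles, so a title with some
# other odd character would still produce a broken file name. A whitelist
# regex generalizes it; the character class here is an assumption about what
# counts as filename-safe, not taken from the source:
import re

def sanitize_title(title):
    # Decode UTF-8 bytes, keep ASCII letters/digits/space/_/-, map the rest to '_'.
    if isinstance(title, str):
        title = title.decode('utf-8', 'ignore')
    return re.sub(u'[^A-Za-z0-9 _-]+', u'_', title).encode('ascii')

# It would replace the chain above as: fix_title = sanitize_title(names[index - 1])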