# -*- coding: utf-8 -*-
__author__ = 'Paul'
import os

import requests
from bs4 import BeautifulSoup
import xlwt
def save_excel(names):
    # Write every paper title into column 0 of a new .xls workbook.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet1 = book.add_sheet('目录', cell_overwrite_ok=True)  # sheet name means "contents"
    for index, name in enumerate(names):
        print name
        sheet1.write(index, 0, name)  # xlwt accepts unicode directly
    book.save('index.xls')
def get_Paper_Names(soup):
    # The first seven <p> tags on the proceedings page are front matter;
    # each later <p> reads "<title> / <authors>", so keep the part before '/'.
    names = []
    for i, p in enumerate(soup.find_all('p')):
        if i < 7:
            continue
        names.append(p.getText().split('/')[0])
    return names
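# An aside, not in the original script: skipping a fixed seven <p> tags is
# fragile if the page layout ever changes. An alternative sketch keeps only
# paragraphs that actually look like "title / authors" entries (the '/'
# marker is an assumption about the page format, not taken from the source):
def get_paper_names_by_marker(soup):
    return [p.getText().split('/')[0]
            for p in soup.find_all('p') if '/' in p.getText()]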
def download_file(url, title):
    # Save the PDF as "<paper id><title>.pdf" under ./ijcai-papers2/,
    # creating the directory on first use.
    if not os.path.isdir('./ijcai-papers2'):
        os.makedirs('./ijcai-papers2')
    local_filename = './ijcai-papers2/' + url.split('/')[-1].split('.')[0] + title + '.pdf'
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename
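# An aside, not part of the original script: download_file above has no
# timeout or error handling, so one stalled server connection hangs the whole
# crawl. A minimal hardened sketch (the 30-second timeout and three retries
# are arbitrary assumptions, not values from the source):
import time

def download_file_safe(url, title, retries=3):
    # Same naming scheme as download_file, plus a timeout and simple retries.
    local_filename = './ijcai-papers2/' + url.split('/')[-1].split('.')[0] + title + '.pdf'
    for attempt in range(retries):
        try:
            r = requests.get(url, stream=True, timeout=30)
            r.raise_for_status()  # treat 4xx/5xx responses as failures worth retrying
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            return local_filename
        except requests.RequestException as e:
            print "attempt %d failed for %s: %s" % (attempt + 1, url, e)
            time.sleep(2)  # brief pause before retrying
    return None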
root_link = "http://ijcai.org/Proceedings/2016"
need_link = "http://ijcai.org/"

r = requests.get(root_link)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    names = get_Paper_Names(soup)
    save_excel(names)
    index = 1  # 1-based position of the current PDF link, pairs it with its title
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:  # some anchors carry no href at all
            continue
        new_link = need_link + href
        if not new_link.endswith(".pdf"):
            continue
        print new_link
        title = names[index - 1].encode("utf-8")
        # Strip characters that are awkward or illegal in file names.
        fix_title = (title.replace("”", "_").replace("“", "_")
                     .replace("∃-", "_").replace("ℓ1", "_").replace("’", "")
                     .replace(':', '_').replace('?', '_').replace("—", "_")
                     .replace("+(∇, ⊓)-", "_"))
        print fix_title
        file_path = download_file(new_link, fix_title)
        print "downloading: " + new_link + " -> " + file_path
        index += 1
    print "all downloads finished"
else:
    print "errors occur."
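# An aside, not in the original: the replace() chain above only strips the
# specific characters that appear in the 2016 titles, so a title with some
# other odd character would still produce a broken file name. A whitelist
# regex generalizes it; the character class here is an assumption about what
# counts as filename-safe, not taken from the source:
import re

def sanitize_title(title):
    # Decode UTF-8 bytes, keep ASCII letters/digits/space/_/-, map the rest to '_'.
    if isinstance(title, str):
        title = title.decode('utf-8', 'ignore')
    return re.sub(u'[^A-Za-z0-9 _-]+', u'_', title).encode('ascii')

# It would replace the chain above as: fix_title = sanitize_title(names[index - 1])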