py1/shizhan/qiushi.py at 54a805f11c1664acd77531a23349c85ec375c014

Fork: 0

bello / py1

Find file

Newer

Older

py1 / shizhan / qiushi.py

belloving on 12 May 2016 869 bytes Initial commit

Raw Blame History

# coding:utf-8

# Scrape one page of content from Qiushibaike (糗事百科).

import urllib
import urllib2
import re

# Fetch one listing page and print every post found on it as
# "author / content / vote text". Runs at import time as a plain script.

# Page number substituted into the listing URL below.
page = 3
url = "http://www.qiushibaike.com/textnew/page/" + str(page) + "/"
# The request identifies itself as mobile Safari on an iPhone.
# NOTE(review): the class names matched below ("touch-user-name", ...) look
# like the mobile markup, which is presumably why this UA is sent -- confirm.
useragent = "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"
headers = {'User-Agent': useragent}

# Three capture groups per post, in order:
#   1. text of the "touch-user-name" span   (author)
#   2. text of the "content-text" div       (post body)
#   3. text of the span carrying a data-votes attribute
# re.S lets '.' match newlines so one match can span several lines of HTML.
patten = re.compile('<span class="c-bl touch-user-name">(.*?)</span>.*?<div class="mlr mt10 content-text">(.*?)</div>.*?data-votes=".*?">(.*?)</span>', re.S)

try:
    # Only the network call lives inside the try: it is the only statement
    # that can raise the urllib2 errors handled below.
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
except urllib2.HTTPError as e:
    # The server answered, but with an error status.
    print(e.code)
    print(e.reason)
except urllib2.URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    # Must come after HTTPError, which is a subclass of URLError.
    print(e.reason)
else:
    try:
        content = response.read().decode('utf-8')
    finally:
        # Release the underlying socket even if reading or decoding fails.
        response.close()

    items = patten.findall(content)
    for author, text, votes in items:
        # The captured groups are unicode (the page was decoded above), so the
        # labels must be unicode literals too: concatenating a non-ASCII byte
        # string with unicode triggers an implicit ASCII decode and raises
        # UnicodeDecodeError on Python 2.
        print(u"作者：" + author + u"内容：" + text + votes)