Newer
Older
py1 / shizhan / dongliu / list.py
ubt on 11 May 2017 4 KB first
#! /usr/bin/python
# -*- coding:utf-8 -*-

import urllib2
import urllib
import re
from socket import error as SocketError
from cookielib import CookieJar
import cookielib
import tools
import zlib
import chardet


import time


class DL_LIST:
    """Crawler for the entry list published at ``baseUrl``.

    ``getList`` downloads the list page, extracts every ``(href, title)``
    pair from the ``book`` list items, and calls ``getContent`` for each
    one.  ``getContent`` POSTs the ids parsed from the href to the site's
    ``dongliu.php`` endpoint and prints the decoded response.

    Both public methods report failures on stdout and return ``False``
    instead of raising; they return ``True`` on success.
    """

    # Maximum number of getContent attempts per list entry.  A bound is
    # required: a permanent failure (e.g. HTTP 404) would otherwise make
    # getList retry forever.
    MAX_RETRIES = 3

    # Endpoint that serves the text of a single entry.
    CONTENT_URL = 'http://www.dongliuxiaoshuo.com/dongliu.php'

    def __init__(self):
        self.baseUrl = 'http://www.dongliuxiaoshuo.com/n/1524.html'
        # Only referenced by the (currently disabled) writeFile call in
        # getContent; presumably a text clean-up helper -- see tools.py.
        self.tool = tools.Tool()

    def writeFile(self, title, content):
        """Append ``title`` and ``content`` to ``./c.txt`` as UTF-8.

        Both arguments are expected to be unicode text.  The record is
        written as the title, a newline, the content and a blank line.
        """
        record = title.encode('utf-8') + b'\n' + content.encode('utf-8') + b'\n\n'
        with open('./c.txt', 'ab+') as f:
            f.write(record)

    @staticmethod
    def _parseIds(url):
        """Return ``(bid, rid)``: the last two path segments of ``url``.

        ``'/n/1524/979963.html'`` yields ``('1524', '979963')``.  Absolute
        URLs and hrefs without a leading slash are accepted as well.

        Raises:
            ValueError: if ``url`` has fewer than two path segments.
        """
        path = url.split('?', 1)[0]
        segments = [part for part in path.split('/') if part]
        if len(segments) < 2:
            raise ValueError('cannot extract ids from url %r' % (url,))
        # Drop the file extension from the final segment.
        return segments[-2], segments[-1].split('.', 1)[0]

    @staticmethod
    def _decodeBody(body, contentEncoding):
        """Undo the HTTP ``Content-Encoding`` applied to ``body``.

        Args:
            body: raw response bytes.
            contentEncoding: value of the Content-Encoding header, or None.

        Returns:
            The decompressed bytes for gzip/deflate responses, otherwise
            ``body`` unchanged.
        """
        encoding = (contentEncoding or '').lower()
        if 'gzip' not in encoding and 'deflate' not in encoding:
            return body
        try:
            # 32 + MAX_WBITS lets zlib auto-detect a gzip or zlib header.
            return zlib.decompress(body, 32 + zlib.MAX_WBITS)
        except zlib.error:
            # Some servers send a headerless (raw) deflate stream.
            return zlib.decompress(body, -zlib.MAX_WBITS)

    @staticmethod
    def _reportError(e):
        """Print a one-line description of ``e``, most specific type first."""
        # HTTPError must be tested before URLError because it subclasses it.
        if isinstance(e, urllib2.HTTPError):
            print('HTTPError: ' + str(e.code))
        elif isinstance(e, urllib2.URLError):
            print('URLError: ' + str(e.reason))
        elif isinstance(e, SocketError):
            print('SocketError: ' + str(e.errno))
        else:
            # str(e) rather than the deprecated e.message, which is empty
            # for exceptions constructed with more than one argument.
            print('Exception' + str(e))

    def getContent(self, url, title):
        """Fetch one entry from the content endpoint and print its text.

        Args:
            url: href of the entry, e.g. ``'/n/1524/979963.html'``; its
                last two path segments are sent as ``bid`` and ``rid``.
            title: entry title.  Currently unused because saving to disk
                is disabled (see the note before the final return).

        Returns:
            True on success, False if anything went wrong (the error is
            printed, never raised).
        """
        # Validate the href before touching the network.
        try:
            print('\n***** ' + url + ' *****')
            bid, rid = self._parseIds(url)
        except Exception as e:
            print('Exception' + str(e))
            return False

        try:
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))
            # Headers imitate the AJAX request a browser sends to this endpoint.
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'),
                ('Referer', 'http://www.dongliuxiaoshuo.com/n/1524/979963.html'),
                ('Host', 'www.dongliuxiaoshuo.com'),
                ('X-Requested-With', 'XMLHttpRequest'),
                ('Accept-Encoding', 'gzip, deflate'),
                ('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6'),
                ('Cache-Control', 'no-cache'),
                ('Connection', 'keep-alive'),
                ('Origin', 'http://www.dongliuxiaoshuo.com'),
                ('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8'),
                ('DNT', '1'),
            ]

            payload = urllib.urlencode({
                'bid': bid,
                'rid': rid,
                # Fixed token sent with every request; NOTE(review): its
                # meaning/lifetime is not visible here -- confirm it is static.
                'fid': 'fb96549631c835eb239cd614cc6b5cb7d295121a',
            })

            response = opener.open(self.CONTENT_URL, payload, timeout=10)
            try:
                body = response.read()
                contentEncoding = response.headers.get('Content-Encoding')
            finally:
                response.close()

            content = self._decodeBody(body, contentEncoding).decode('utf8')
            print(content)

            # Saving is disabled for now:
            # self.writeFile(title.decode("utf8"), self.tool.replace(content))
        except Exception as e:
            self._reportError(e)
            return False
        return True

    def getList(self):
        """Download the list page and fetch every entry found on it.

        Each entry is attempted at most ``MAX_RETRIES`` times; an entry
        that keeps failing is reported and skipped.

        Returns:
            True once the list was processed, False if the list page
            itself could not be fetched or parsed (the error is printed).
        """
        try:
            print('\n***** ' + self.baseUrl + ' *****')

            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))
            # No custom headers; this also replaces urllib2's default
            # header list on the opener.
            opener.addheaders = []

            response = opener.open(self.baseUrl, timeout=10)
            try:
                page = response.read()
            finally:
                response.close()

            pattern = re.compile(
                '<li class="list-group-item col-lg-3 col-md-3 col-sm-4 col-xs-6 book"><a href="(.*?)" title="(.*?)"', re.S)
            for href, title in pattern.findall(page):
                for _ in range(self.MAX_RETRIES):
                    # Exactly one request per attempt; anything but an
                    # explicit False counts as success.
                    if self.getContent(href, title) is not False:
                        break
                    print(title)
                else:
                    print('Skipped after ' + str(self.MAX_RETRIES) + ' failed attempts: ' + title)
        except Exception as e:
            self._reportError(e)
            return False
        return True



# Script body: build the crawler, run one pass over the list page and
# report how long the pass took.
lf = DL_LIST()

start = time.time()
lf.getList()
end = time.time() - start

# Disabled experiment, kept for reference: iterate page numbers from 8175
# down to 1, print a "page number" banner for each and call lf.wang(page).
# DL_LIST as shown in this file defines no `wang` method.

# The message below reads "total time taken:".
print('总共用时:' + str(end))