# Newer
# Older
# PythonProxy / shy.py
# bdapp on 15 Mar 2019 2 KB init
from pyquery import PyQuery as pq
import requests
import time
import sys
import proxy


class SHY:
    """Scraper for www.cmshy.com that downloads pages through a list of proxies.

    Attributes:
        proxy_ip_list: Proxy configurations returned by ``proxy.PROXY().get()``;
            each entry is passed as-is to ``requests.get(proxies=...)``.
        proxy_id: Index into ``proxy_ip_list`` of the proxy currently in use.
            It only ever moves forward; once it reaches the end of the list
            every further request fails fast.
    """

    # Browser-like User-Agent header sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'

    def __init__(self):
        """Fetch the proxy list and start with its first entry."""
        p = proxy.PROXY()
        self.proxy_ip_list = p.get()
        self.proxy_id = 0

    def request_html(self, url):
        """Download ``url`` through the current proxy, rotating on failure.

        A proxy is abandoned (``proxy_id`` is advanced) when the request
        raises or answers with a status other than 200.

        Args:
            url: Address to fetch.

        Returns:
            The response body decoded as GBK, or ``None`` when no remaining
            proxy could deliver a 200 response.
        """
        headers = {'User-Agent': self.USER_AGENT}

        # Iterate instead of recursing: the result is always returned to the
        # caller and the proxy index can never run past the end of the list.
        while self.proxy_id < len(self.proxy_ip_list):
            current_proxy = self.proxy_ip_list[self.proxy_id]
            print(current_proxy)
            try:
                response = requests.get(url=url, headers=headers, timeout=8, proxies=current_proxy)
            except Exception as e:
                # Best effort: any failure of this proxy just means "try the next one".
                print(e)
            else:
                if response.status_code == 200:
                    response.encoding = 'gbk'
                    return response.text
                print('change proxy')
            self.proxy_id += 1

        return None

    def page_sort(self, page):
        """Download every entry linked from one index page and save it to disk.

        Fetches ``http://www.cmshy.com/html/<page>/index.html``, reads the
        title from ``.directory-box .book-tit`` and follows the first link of
        each ``.directory-box .cell-items ul li`` item. Each downloaded entry
        is appended to the page's text file, with a 2 second pause between
        entries.

        Failures are reported on stdout and end processing of this page; the
        page is not retried, because a retry would append the entries already
        saved a second time.

        Args:
            page: Numeric id of the index page.
        """
        try:
            url = 'http://www.cmshy.com/html/' + str(page) + '/index.html'
            html = self.request_html(url)
            if not html:
                # Covers both an empty body and ``None`` (all proxies failed).
                return

            html_content = pq(html)
            title = html_content('.directory-box .book-tit').text()
            items = html_content('.directory-box .cell-items ul li')
            for item in items:
                sub_url = pq(item).find('a').attr('href')
                if not sub_url:
                    # List item without a usable link: nothing to download.
                    continue

                content = self.detail_page(sub_url)
                if content:
                    self.write_file(page, title, content)

                time.sleep(2)

        except Exception as e:
            print('page sort error')
            print(e)

    def write_file(self, page, title, content):
        """Append ``content`` to ``./txt/<page>-<title>.txt``.

        The ``./txt`` directory is created when missing, path separators in
        ``title`` are replaced by ``_`` so the title cannot escape that
        directory, and the file is written as UTF-8 regardless of the locale.
        Errors are printed, never raised, so a failed write does not stop the
        crawl.

        Args:
            page: Numeric id of the index page, used as file name prefix.
            title: Title used as the second part of the file name.
            content: Text to append.
        """
        import os  # local import: only this method needs it

        try:
            os.makedirs('./txt', exist_ok=True)
            safe_title = title.replace('/', '_').replace('\\', '_')
            path = './txt/' + str(page) + '-' + safe_title + '.txt'
            with open(path, 'a+', encoding='utf-8') as f:
                f.write(content)
        except Exception as e:
            print('write file error')
            print(e)

    def detail_page(self, url):
        """Download one detail page and extract its text.

        Args:
            url: Address of the detail page.

        Returns:
            The texts of ``.title``, ``.author`` and ``.content`` inside
            ``.read-wrap``, each followed by a newline plus one trailing blank
            line, or ``None`` when the page could not be downloaded or parsed.
        """
        try:
            html = self.request_html(url)
            if not html:
                # Covers both an empty body and ``None`` (all proxies failed).
                return None

            read = pq(html)('.read-wrap')
            c1 = read('.title').text()
            c2 = read('.author').text()
            c3 = read('.content').text()

            return c1 + '\n' + c2 + '\n' + c3 + '\n\n'

        except Exception as e:
            print('detail page error')
            print(e)
            return None


if __name__ == '__main__':
    # Walk index pages 1736 through 4999 in order, announcing each page and
    # pausing two seconds after it has been processed.
    crawler = SHY()
    page_no = 1736
    while page_no < 5000:
        banner = '~~~~~~~~~~ page ' + str(page_no) + ' ~~~~~~~~~~~'
        print(banner)
        crawler.page_sort(page_no)
        time.sleep(2)
        page_no += 1