Newer
Older
py1 / shizhan / jd_mobile.py
belloving on 20 May 2016 5 KB proxy
# -*- coding:utf-8 -*-

import urllib2
import re
import csv
from socket import error as SocketError
import codecs
import time


class JD:
    def __init__(self):
        self.baseUrl = 'http://list.jd.com/list.html?cat=9987,653,655&ev=exbrand_18374&area=12,978,3391&go=0&JL=6_0_0&ms=6&page='
        self.file = '/home/ubt/mm_xiaomi.csv'
        self.paramList = []


    # 开启网络请求
    def requestOpener(self, url):
        try:
            opener = urllib2.build_opener(urllib2.HTTPHandler())
            opener.addheaders = [('User_Agent',
                                  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36')]
            response = opener.open(url, timeout=20)
            return response.read()
        except urllib2.HTTPError, e:
            print e.code
            return ''
        except urllib2.URLError, e:
            print e.reason
            return ''
        except SocketError, e:
            print 'SocketError: ' + str(e.errno)


    # 写入文件
    def writeCSV(self, boo):
        try:
            with open(self.file, 'ab') as f:
                f.write(codecs.BOM_UTF8)
                w = csv.writer(f, dialect='excel')
                if boo:
                    w.writerow(['品牌','型号','价格','上市年份','上市月份','操作系统',
                               '操作系统版本','CPU品牌','CPU型号','CPU核数',
                               'GPU','机身内存','运行内存','最大存储扩展',
                               '屏幕尺寸','分辨率','后置摄像头','前置摄像头',
                                '链接','备注','店铺','评价'])

                for x in self.paramList:
                    #print x
                    w.writerow(x)
        except Exception:
            print '写入数据失败'


    # 打开产品详情页获取参数
    def getDetailParame(self, content, url, name, comment, shop, price):
        p1 = re.compile('<td class=.*?tdTitle.*?>(.*?)<.*?<td>(.*?)<', re.S)
        r1 = re.findall(p1, content)

        parameDict = {}
        parames = []
        for i in r1:
            parameDict[i[0]] = i[1]

        print len(parameDict)
        parames.append(parameDict.get('品牌',''))
        parames.append(parameDict.get('型号',''))
        parames.append(price)
        parames.append(parameDict.get('上市年份',''))
        parames.append(parameDict.get('上市月份',''))
        parames.append(parameDict.get('操作系统',''))
        parames.append(parameDict.get('操作系统版本',''))
        parames.append(parameDict.get('CPU品牌',''))
        parames.append(parameDict.get('CPU型号',''))
        parames.append(parameDict.get('CPU核数',''))
        parames.append(parameDict.get('GPU',''))
        parames.append(parameDict.get('机身内存',''))
        parames.append(parameDict.get('运行内存',''))
        parames.append(parameDict.get('最大存储扩展',''))
        parames.append(parameDict.get('屏幕尺寸',''))
        parames.append(parameDict.get('分辨率',''))
        parames.append(parameDict.get('后置摄像头',''))
        parames.append(parameDict.get('前置摄像头',''))
        parames.append('http:'+url)
        parames.append(name)
        parames.append(shop)
        parames.append(comment)

        self.paramList.append(parames)
        print len(self.paramList)
        print '获取完成'


    # 获取产品价格
    def getPrice(self, id):
        co = self.requestOpener(
            'http://p.3.cn/prices/get?type=1&area=1_72_2799&pdtk=&pduid=360917087&pdpin=&pdbp=0&skuid=J_' + id + '&callback=cnp')
        if co != '':
            p = re.compile('"p":"(.*?)"', re.S)
            r = re.search(p, co)
            return r.group(1)
        else:
            return ''


    # 获取产品ID
    def getProductId(self, url):
        try:
            p = re.compile('/.*?/(\d.*?).htm', re.S)
            r = re.search(p, url)
            return r.group(1)
        except Exception:
            print '获取产品ID失败'
            return ''



    # 获取每个产品的详情页链接
    def getDetailUrl(self, content):
        try:
            pattern = re.compile('<div class="p-icons J-pro-icons".*?href="(.*?)".*?<em>(.*?)</em>.*?#comment">(.*?)<.*?data-shop_name="(.*?)"', re.S)
            items = re.findall(pattern, content)
            for i in items:
                print 'url----: ' + i[0] + ' name-----: ' + i[1]
                id = self.getProductId(i[0])
                if id != '':
                    price = self.getPrice(id)
                    co = self.requestOpener('http://item.m.jd.com/ware/detail.json?wareId=' + id)
                    if co != '':
                        self.getDetailParame(co, i[0], i[1], i[2], i[3], price)

        except Exception:
            print '获取产品链接失败'


    # 开始执行任务
    def start(self, fromPge, toPge):
        startTime = time.time()
        for i in range(fromPge, toPge+1):
            print '第' + str(i) + '页扫描开始~~~~~~~~~~~~~~~~'
            content = self.requestOpener(self.baseUrl + str(i))
            self.getDetailUrl(content)
            if i==fromPge:
                self.writeCSV(True)
            else:
                self.writeCSV(False)
            self.paramList = []
        print '总共用时' + str(time.time()-startTime)

jd = JD()
jd.start(1, 3)