#!/usr/bin/python
# encoding: utf-8
"""
Scrape the per-SKU prices of a single Tmall product page into an Excel file.

The script downloads the product detail page, reads the shop nickname and the
colour options from the HTML, extracts the SKU price and SKU description
records embedded in an inline <script>, joins them on the SKU id and writes
one row per SKU to ``單品價格.xls``.

@author:
@contact: zhangcdnuli@163.com
@file: 單品SKU價格.py
@time: 2020/7/9 17:49
"""
import datetime
import json
import re
from urllib import parse

import pandas as pd
import requests
from lxml import etree

# Product link.
url = 'https://detail.tmall.com/item.htm?id=614248351317'

# Request headers.
# NOTE(review): the cookie below is a captured login session committed to
# source. It looks account-specific and will expire; consider loading it from
# the environment or a local config file instead.
headers = {
    'cookie': 'ucn=center; thw=cn; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; UM_distinctid=172bacef669d1d-0123a63c3604fc-7d7f582e-1fa400-172bacef66a4b1; t=9c78ab28217e5c03ac9e9b7ac4a96b17; _samesite_flag_=true; cookie2=1a12abf3e634615b0ae8c9f6a1dcff1a; _tb_token_=101363b3ae63; _utk=VocP@qJyn^AtWdm; _m_h5_tk=fafb157770efc6957271c0e289cd9053_1594290551260; _m_h5_tk_enc=ed5462a45f4bfb75d4dce7b88da873d1; v=0; cna=tWZTF6mWVysCAXrrVSqSNZlL; unb=1860101593; uc1=cookie14=UoTV6OOAIfZxhw%3D%3D&existShop=false&cookie15=VFC%2FuZ9ayeYq2g%3D%3D&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9aiKCaj7AzMpJs&pas=0; uc3=nk2=py6KEP%2BTZBa19g%3D%3D&vt3=F8dBxGJklTDOWW8R%2BBU%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&id2=UondFJpkLDbmXQ%3D%3D; csg=d5853e56; lgc=%5Cu9006%5Cu5929%5Cu7684%5Cu5FB7%5Cu9A6C; cookie17=UondFJpkLDbmXQ%3D%3D; dnk=%5Cu9006%5Cu5929%5Cu7684%5Cu5FB7%5Cu9A6C; skt=74abec5a239ff370; existShop=MTU5NDI4ODUwMA%3D%3D; uc4=nk4=0%40pRiFHXyoUkznaw76Zk30E4gU8N5A&id4=0%40UOE3Gr625tkvZrJYSKrwXzwxSnEm; tracknick=%5Cu9006%5Cu5929%5Cu7684%5Cu5FB7%5Cu9A6C; _cc_=VFC%2FuZ9ajQ%3D%3D; _l_g_=Ug%3D%3D; sg=%E9%A9%AC35; _nk_=%5Cu9006%5Cu5929%5Cu7684%5Cu5FB7%5Cu9A6C; cookie1=BxFh9QqyhvW6Gckpnz16DykP8lN3%2BrS712Z7YRGoMAo%3D; sgcookie=EKQH6JkpBusWqZgEAlZjv; tfstk=cMXVBQi9e-e2jsv_9K9Zhh25UGdAapIcjY-Bi6Bj19HKOqAD8sCcWacYRla4sfC..; isg=BLq60-WmsTMJKzwnIALkO7W3BeDcaz5Fayqvo8SzSM0Yt17xh_sqVViGB0NrPLbd; l=eBgcTX1rQQp7RMU-BOfZ-urza77OdBd8YuPzaNbMiOCPO7CH5HqVWZlyxFTMCnGVnsFyU3Szm2rBBRYK8yUIQxv9-e_7XPQondLh.; enc=MY%2BEp8Q61iHdln2P6tKE0i6vnZJa7bt%2FHZWEE%2FTBlTOQZeFTmJHus%2F%2BfKNvVL1WM7cFFXsBN5jNmtjsNKYdkmg%3D%3D',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
    'referer': 'https://detail.tmall.com/item.htm?id=614248351317',
}

# Seconds to wait for the server before giving up, so the script cannot hang
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

# XPath of the <ul> holding the first group of SKU options on the page.
SKU_OPTION_LIST_XPATH = (
    '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[4]/div/div/dl[1]/dd/ul'
)

# Records embedded in the inline script: one carries the price of each SKU,
# the other its display names and property-value string.
PRICE_RECORD = re.compile(
    r'{"priceCent":(.*?),"price":"(.*?)","stock":(.*?),"skuId":"(.*?)"}'
)
NAME_RECORD = re.compile(r'{"names":"(.*?)","pvs":"(.*?)","skuId":"(.*?)"}')

# Length of the leading slice of "pvs" that is used as the key into listsku.
# NOTE(review): assumes the first property:value pair is always 13 characters
# long and matches an li data-value - confirm against other products.
PV_KEY_LENGTH = 13

OUTPUT_COLUMNS = ['日期', '商家昵稱', '商品ID', 'SKUID', 'SKU名稱', 'SKU價格']


def _first(nodes, what):
    """Return the first XPath result, or raise IndexError naming what is missing."""
    if not nodes:
        raise IndexError(
            '{} not found in the page; the layout may have changed or the '
            'session cookie may have expired'.format(what)
        )
    return nodes[0]


# Fetch the HTML document; fail early on an HTTP error status instead of
# trying to parse an error page.
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
html = response.text
selector = etree.HTML(html)

# Read the product title, keeping only characters in U+4E00..U+9FA5.
title = selector.xpath('//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()')
商品標題 = re.compile(u"[^\u4e00-\u9fa5]+").sub('', str(_first(title, 'product title')))

# Read the shop nickname.
shop = selector.xpath('//*[@id="shopExtra"]/div[1]/a/strong/text()')
商家昵稱 = _first(shop, 'shop nickname')

# Take the product ID from the "id" query parameter of the URL.
params = parse.parse_qs(parse.urlparse(url).query)
商品ID = params['id'][0]

# Current date, stamped on every output row.
日期 = datetime.datetime.now().strftime('%Y-%m-%d')

# Map each option's data-value attribute to its title attribute.
listsku = {'id': '名稱'}
allsku = selector.xpath(SKU_OPTION_LIST_XPATH + '/li')
for option in allsku:
    pv_key = option.get('data-value')
    option_name = option.get('title')
    if pv_key is None or option_name is None:
        # Not a selectable option; nothing to map.
        continue
    listsku[str(pv_key)] = str(option_name)

# Map each SKU id to its price, taken from the inline script.
text = selector.xpath('//*[@id="J_DetailMeta"]/div[1]/script[3]/text()')
sku_script = _first(text, 'SKU data script')
listmoney = {'skuid': '價格'}
for match in PRICE_RECORD.finditer(sku_script):
    moneyinfo = json.loads(match.group())
    listmoney[moneyinfo['skuId']] = moneyinfo['price']

# Join SKU id, name and price into one row per SKU. The rows are collected in
# a list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
rows = []
for match in NAME_RECORD.finditer(sku_script):
    skuinfo = json.loads(match.group())
    SKUID = skuinfo['skuId']
    # The second space-separated token of "names" is used as the size and the
    # leading slice of "pvs" selects the colour name collected above.
    SKU名稱 = (
        '尺碼:' + skuinfo['names'].split(' ')[1]
        + '顏色分類:' + listsku[skuinfo['pvs'][0:PV_KEY_LENGTH]]
    )
    SKU價格 = listmoney[SKUID]
    rows.append({
        '日期': 日期,
        '商家昵稱': 商家昵稱,
        '商品ID': 商品ID,
        'SKUID': SKUID,
        'SKU名稱': SKU名稱,
        'SKU價格': SKU價格,
    })
skuData = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

# Write the result to an Excel file.
# NOTE(review): writing ".xls" needs the xlwt engine, which pandas 2.0
# removed; confirm the pinned pandas version or switch the output to ".xlsx".
skuData.to_excel('單品價格.xls')
print('完成')