Scraping Tmall product review counts
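The script below reads one product URL per line from get_com_sku_tmall.txt, takes the item id from the query string, asks the rate.tmall.com review endpoint for the review statistics, and writes one link,allcom row per product to result_tmall.csv. A quick sanity check of the regex it relies on, run against a made-up response fragment (the real reply is only assumed to contain a "total":<n> field):

import re
sample = '... "total":2371 ...'   # hypothetical fragment of the endpoint's response
print re.search(r'"total":(\d+)', sample).group(1)   # -> 2371
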
#coding:utf-8
import requests, json, csv, sys, re

reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "cookie": "**",
    "upgrade-insecure-requests": "1",
    "user-agent": "",
}

csvfile = open('result_tmall.csv', 'wb')
writer = csv.writer(csvfile)
writer.writerow(('link', 'allcom'))

for link in open('get_com_sku_tmall.txt'):
    sku = link.strip().split('=')[1]  # item id taken from the URL query string
    print sku
    try:
        url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=%s&spuId=%s&sellerId=%s' % (sku, sku, sku)
        r = requests.get(url, headers=headers)
        total_ = re.search(r'"total":(\d+)', r.content)
        ttl = total_.group(1)  # the captured review count
        writer.writerow((link.strip(), ttl))
    except Exception, e:
        print e
        with open('get_com_sku_fail_tmall.txt', 'a+') as my:  # log failed links for a retry
            my.write(link.strip() + '\n')
csvfile.close()

Scraping the Tmall review-tag cloud ("大家都写到" / "what everyone mentions")
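The tag-cloud endpoint is requested with a jsonp callback, so the body presumably comes back wrapped in jsonp1298(...); the script below simply regexes every "tag" value out of the raw text and joins them with |. A quick check of that regex against a made-up fragment (the surrounding JSON structure is an assumption; only the "tag" field name and the callback name come from the script):

#coding:utf-8
import re
sample = 'jsonp1298({"tags":[{"tag":"质量很好"},{"tag":"物流很快"}]})'  # hypothetical response body
print '|'.join(re.findall(r'"tag":"(.*?)"', sample))   # -> 质量很好|物流很快
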
#coding:utf-8
import requests, json, csv, re

def main():
    csvfile = open('res_tmall.csv', 'wb')
    write = csv.writer(csvfile)
    write.writerow(('link', 'tags'))
    for link in open('sku_tmall.txt'):
        headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.8",
            "Cookie": '**',
            "referer": "%s" % link.strip(),
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        }
        proid = link.strip().split('=')[1]  # item id taken from the URL query string
        print proid
        try:
            url = 'https://rate.tmall.com/listTagClouds.htm'
            payload = {
                'itemId': '%s' % proid,
                'isAll': 'true',
                'isInner': 'true',
                't': '1508397490861',            # hard-coded timestamp parameter
                '_ksTS': '1508397490862_1297',
                'callback': 'jsonp1298',         # jsonp callback name echoed in the response
            }
            r = requests.get(url, params=payload, timeout=60, headers=headers)
            print r.url
            if r.status_code == 200:
                tags = re.findall(r'"tag":"(.*?)"', r.content)
                write.writerow((link.strip(), '|'.join(tags)))
        except Exception, e:
            print e
            with open('fail.txt', 'a+') as f:  # log failed links for a retry
                f.write(link.strip() + '\n')
    csvfile.close()

if __name__ == '__main__':
    main()

Getting the number of colour variants (SKUs) for a Tmall product
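The input file re_tmall.txt is the one written by the keyword-search script further down (one keyword|link|price|title record per line, with a protocol-relative product link), and the detail-page URL is rebuilt from its second field, for example (made-up values):

#coding:utf-8
line = '连衣裙|//detail.tmall.com/item.htm?id=520813250866&skuId=1|299.00|某品牌连衣裙\n'  # hypothetical record
print 'http:' + line.strip().split('|')[1].split('&')[0]
# -> http://detail.tmall.com/item.htm?id=520813250866
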
#coding:utf-8
from bs4 import BeautifulSoup
import requests, sys, time

reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "Cookie": "",
    "referer": "https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "",
}

def main():
    num = 0
    for item in open('re_tmall.txt'):
        skunum = 0
        time.sleep(0.5)
        try:
            url = 'http:' + item.strip().split('|')[1].split('&')[0]  # rebuild the detail-page URL from the record
            r = requests.get(url, headers=headers, timeout=60)
            s = BeautifulSoup(r.content, "lxml")
        except Exception, e:
            print e
            continue  # nothing to parse without the page, move on
        try:
            # <li> entries under the sale-property list = number of colour/style variants
            urltag = s.find('ul', attrs={"class": "tm-clear J_TSaleProp tb-img "}).find_all('li')
            skunum = len(urltag)
        except Exception, e:
            print e
        sku_list = []
        try:
            # preview thumbnails
            yulan = s.find('ul', attrs={"id": "J_UlThumb"}).find_all('img')
            for img in yulan:
                sku_list.append(img.get('src'))
        except Exception, e:
            print e
        try:
            with open('re_tmall_skunum_imgurl.txt', 'a+') as my:
                my.write(item.strip() + '|' + str(skunum) + '|' + ",".join(sku_list) + '\n')
        except Exception, e:
            print e
        num += 1
        print num, skunum
        break  # debug leftover: stops after the first record; remove to process the whole file

if __name__ == '__main__':
    main()

Scraping product data from the Tmall list page for a given keyword
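Each result on the list page sits in a div.product-iWrap block, and the output file re_tmall.txt is what the colour-variant script above reads. Thumbnails are lazy-loaded, so the real image URL may sit in data-ks-lazyload rather than src, which is why the script checks both attributes; a minimal illustration against a made-up fragment:

#coding:utf-8
from bs4 import BeautifulSoup
html = '<div class="product-iWrap"><img data-ks-lazyload="//img.example.com/p1.jpg"></div>'  # hypothetical markup
at = BeautifulSoup(html, "lxml").find('div', attrs={'class': 'product-iWrap'})
img = at.find('img').get('src') or at.find('img').get('data-ks-lazyload')
print img   # -> //img.example.com/p1.jpg
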
#coding:utf-8
from bs4 import BeautifulSoup
import requests, sys, time

reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "cache-control": "max-age=0",
    "Cookie": "",
    "referer": "https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "",
}

# Example search URL showing the smAreaId/smToken/smSign parameters:
# https://list.tmall.com/search_product.htm?&smAreaId=110100
# &smToken=d5ab69fafe104a119d1b68cf5dee8e2b&smSign=tynCTnJ2%2BacN%2F4nGJImRRQ%3D%3D
# &smToken=&smSign=

def main():
    for kw in open('kw_tmall.txt'):
        for page in range(5):
            time.sleep(1)
            page = page * 60  # 's' offset: 60 items per result page
            url = 'https://list.tmall.com/search_product.htm'
            payload = {
                'q': '%s' % kw.strip(),
                's': '%s' % page,
                'spm': 'a220m.1000858.a2227oh.d100',
                'sort': 'd',
                'smToken': 'f00c864b1da14df796fd2805498c65af',
                'cat': '50036568',
                'style': 'g',
                'from': '.list.pc_1_searchbutton',
                'smAreaId': '110100',
                'smSign': '6eD%2BBFqGQ7CwR0gGpXh4HA%3D%3D',
            }
            r = requests.get(url, params=payload, headers=headers, timeout=60)
            s = BeautifulSoup(r.content, "lxml")
            divtag = s.find_all('div', attrs={'class': 'product-iWrap'})
            for at in divtag:
                sku = at.find('a').get('href')  # product link
                if at.find('img').get('src'):  # thumbnails are lazy-loaded, so the URL may sit in another attribute
                    img = at.find('img').get('src')
                else:
                    img = at.find('img').get('data-ks-lazyload')
                price = at.find('em').get('title')  # price
                title = at.find('p', attrs={"class": "productTitle"}).find('a').get('title')
                print sku, img, price, title
                with open('re_tmall.txt', 'a+') as my:
                    my.write(kw.strip() + '|' + sku + '|' + price + '|' + title + '\n')
            break  # debug leftover: only the first result page; remove to fetch all five
        break  # debug leftover: only the first keyword; remove to process the whole file

if __name__ == '__main__':
    main()

JD.com's version is just as easy; it's on GitHub, go find it yourself.