
Scraping Some Unmentionable Tmall Data with Python

Author: 二营长    Published: 2017-11-23

Scraping the review count of Tmall products

#coding:utf-8    
import requests,json,csv,sys,re    
reload(sys)    
sys.setdefaultencoding('utf-8')    
headers = {    
	"accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",    
	"accept-encoding":"gzip, deflate, br",    
	"accept-language":"zh-CN,zh;q=0.8",    
	"cache-control":"max-age=0",    
	"cookie":"**",    
	"upgrade-insecure-requests":"1",    
	"user-agent":"",    
}    
csvfile = open('result_tmall.csv','wb')
writer = csv.writer(csvfile)    
writer.writerow(('link','allcom'))    
for link in open('get_com_sku_tmall.txt'):    
	sku = link.strip().split('=')[1]# itemId: the part after '=' on each input line
	print sku    
	try:    
		url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=%s&spuId=%s&sellerId=%s' %(sku,sku,sku)# review endpoint; the same id is reused for all three params
		r = requests.get(url,headers=headers)    
		total_ = re.search(r'"total":(\d+)',r.content)
		ttl = total_.group(1)# the capture group already holds the bare count
		writer.writerow((link.strip(),ttl))    
	except Exception,e:    
		print e    
		with open('get_com_sku_fail_tmall.txt','a+') as my:
			my.write(link.strip()+'\n')    
csvfile.close()
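
The regex above only picks up the first "total" in the raw body. A minimal sketch of parsing the response as JSON instead, assuming the endpoint returns JSONP wrapping a JSON object (the rateDetail key path is an assumption, check a raw response first):

#coding:utf-8
import json,re

def parse_total(body):
	# strip the jsonp wrapper, if any, and parse the remainder as JSON
	m = re.search(r'\{.*\}',body,re.S)
	data = json.loads(m.group(0))
	# hypothetical key path; only the "total" key itself is confirmed by the regex above
	return data.get('rateDetail',{}).get('total')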


Scraping the "大家都写到" ("everyone mentions") review tags of Tmall products

#coding:utf-8    
import requests,json,csv,re    
def main():    
	csvfile = open('res_tmall.csv','wb')    
	write = csv.writer(csvfile)    
	write.writerow(('link','tags'))    
	for link in open('sku_tmall.txt'):    
		headers = {    
			"accept":"*/*",    
			"accept-encoding":"gzip, deflate, br",    
			"accept-language":"zh-CN,zh;q=0.8",    
			"Cookie":'**',    
			"referer":"%s"%link.strip(),    
			"user-agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",    
		}    
		proid = link.strip().split('=')[1]    
		print proid    
		try:    
			url = 'https://rate.tmall.com/listTagClouds.htm'    
			payload = {    
			'itemId':'%s'%proid,    
			'isAll':'true',    
			'isInner':'true',    
			't':'1508397490861',    
			'_ksTS':'1508397490862_1297',    
			'callback':'jsonp1298',    
			}    
			r = requests.get(url,params=payload,timeout=60,headers=headers)    
			print r.url    
			if r.status_code == 200:
				tags = re.findall(r'"tag":"(.*?)"',r.content)    
				write.writerow((link.strip(),'|'.join(tags)))    
		except Exception,e:    
			print e    
			with open('fail.txt','a+') as f:
				f.write(link.strip()+'\n')    
if __name__ == '__main__':
	main()
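
The t, _ksTS and callback values in the payload above are hardcoded copies of what the page's JavaScript generated during one session. Assuming the endpoint only checks their format (an assumption, not verified), fresh values can be built per request and swapped into the payload like this:

import time,random

ts = int(time.time()*1000)
rnd = random.randint(1000,9999)
payload.update({
	't':str(ts),
	'_ksTS':'%d_%d' %(ts,rnd),# millisecond timestamp plus a random suffix
	'callback':'jsonp%d' %(rnd+1),# the page uses suffix+1 as the callback number
})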


Getting the number of color variants of a Tmall product

#coding:utf-8    
from bs4 import BeautifulSoup    
import requests,sys,time    
reload(sys)    
sys.setdefaultencoding('utf-8')    
headers = {    
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",    
"Accept-Encoding": "gzip, deflate, sdch",    
"Accept-Language": "zh-CN,zh;q=0.8",    
"cache-control": "max-age=0",    
"Cookie": "",    
"referer":"https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",    
"Upgrade-Insecure-Requests": "1",    
"User-Agent": "",    
}    
def main():    
	num = 0    
	for item in open('re_tmall.txt'):    
		skunum = 0    
		time.sleep(0.5)    
		try:    
			url = 'http:'+item.strip().split('|')[1].split('&')[0]# second field of re_tmall.txt is the protocol-relative product link
			r = requests.get(url,headers=headers,timeout=60)    
			s = BeautifulSoup(r.content,"lxml")    
		except Exception,e:
			print e
			continue# the page never loaded, skip this item
		try:    
			urltag = s.find('ul',attrs={"class":"tm-clear J_TSaleProp tb-img     "}).find_all('li')# class string copied verbatim from the page source, trailing spaces included
			skunum = len(urltag)# one <li> per color/style variant
		except Exception,e:    
			print e    
		sku_list = []    
		try:    
			yulan = s.find('ul',attrs={"id":"J_UlThumb"}).find_all('img')# preview thumbnails under the main image
			for img in yulan:    
				imgurl = img.get('src')    
				sku_list.append(imgurl)    
		except Exception,e:    
			print e    
		try:    
			with open('re_tmall_skunum_imgurl.txt','a+') as my:
				my.write(item.strip()+'|'+str(skunum)+'|'+",".join(sku_list)+'\n')    
		except Exception,e:    
			print e    
		num += 1    
		print num,skunum    
		break# stops after the first item (testing leftover); remove to process the whole file
if __name__ == '__main__':    
	main()
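
The thumbnail URLs collected above usually carry a resize suffix such as _60x60q90.jpg appended to the original image path. Assuming that naming convention holds (it is not guaranteed), a small helper can recover the full-size image URL:

import re

def full_size(thumb_url):
	# drop a trailing _WxHqNN.jpg resize suffix, if present, leaving the original .jpg path
	return re.sub(r'_\d+x\d+q\d+\.jpg$','',thumb_url)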


Feeding keywords to the Tmall list page and scraping product data

#coding:utf-8    
from bs4 import BeautifulSoup    
import requests,sys,time    
reload(sys)    
sys.setdefaultencoding('utf-8')    
headers = {    
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",    
"Accept-Encoding": "gzip, deflate, sdch",    
"Accept-Language": "zh-CN,zh;q=0.8",    
"cache-control": "max-age=0",    
"Cookie": "",    
"referer":"https://www.tmall.com/?spm=a220o.1000855.0.0.5c90c348cacB8C",    
"Upgrade-Insecure-Requests": "1",    
"User-Agent": "",    
}    
#example list URL; smToken/smSign appear session-bound and expire, so copy fresh values from the browser:
#https://list.tmall.com/search_product.htm?&smAreaId=110100
#&smToken=d5ab69fafe104a119d1b68cf5dee8e2b&smSign=tynCTnJ2%2BacN%2F4nGJImRRQ%3D%3D
#&smToken=&smSign=
def main():    
	for kw in open('kw_tmall.txt'):    
		for page in range(5):    
			time.sleep(1)    
			page = page * 60# s is the result offset, 60 items per page
			url = 'https://list.tmall.com/search_product.htm'    
			payload = {    
			'q':'%s'%kw.strip(),    
			's':'%s'%page,    
			'spm':'a220m.1000858.a2227oh.d100',    
			'sort':'d',    
			'smToken':'f00c864b1da14df796fd2805498c65af',# replace with a fresh token from the browser
			'cat':'50036568',    
			'style':'g',    
			'from':'.list.pc_1_searchbutton',    
			'smAreaId':'110100',    
			'smSign':'6eD+BFqGQ7CwR0gGpXh4HA==',# decoded value; requests url-encodes params itself, so don't pass a pre-encoded string
			}    
			r = requests.get(url,params=payload,headers=headers,timeout=60)    
			s = BeautifulSoup(r.content,"lxml")    
			divtag = s.find_all('div',attrs={'class':'product-iWrap'})    
			for at in divtag:    
				sku = at.find('a').get('href')# product link
				if at.find('img').get('src'):# images are lazy-loaded, so the attribute differs; check both
					img = at.find('img').get('src')
				else:
					img = at.find('img').get('data-ks-lazyload')
				price = at.find('em').get('title')# price
				title = at.find('p',attrs={"class":"productTitle"}).find('a').get('title')
				print sku,img,price,title    
				with open('re_tmall.txt','a+') as my:
					my.write(kw.strip()+'|'+sku+'|'+price+'|'+title+'\n')    
			break# stops after the first page (testing leftover); remove to crawl all 5 pages
		break# stops after the first keyword (testing leftover); remove to crawl every keyword
if __name__ == '__main__':    
	main()
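
The re_tmall.txt file written here can feed the scripts above. A minimal sketch of turning each scraped link into the id=... lines the first script expects, assuming the product links carry an id= query parameter (the file formats follow this post's own scripts):

#coding:utf-8
from urlparse import urlparse,parse_qs

with open('re_tmall.txt') as src, open('get_com_sku_tmall.txt','w') as dst:
	for line in src:
		link = line.strip().split('|')[1]# second field is the product link
		qs = parse_qs(urlparse('http:'+link).query)# links are protocol-relative
		if 'id' in qs:
			dst.write('id=%s\n' % qs['id'][0])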


JD.com is just as easy to scrape; there are projects for it on GitHub, go find one yourself.





Email: techseo.cn@gmail.com, questions and exchanges welcome.