
Python web scraper: fetching data from Zcool (站酷)

1. Fetching the Zcool data

Two things to note about the code below: detail-page requests are routed through an IP proxy, and different work types (works vs. articles) build their detail-page URLs differently, so the URL has to be assembled per type, as sketched below.
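
As context for the full script, here is a minimal sketch of how the detail URL is chosen. The pageUrl and idStr fields and the two endpoint prefixes are taken directly from the spider below; the helper name build_detail_url is only illustrative.

def build_detail_url(item):
    # 'item' is one entry of the list API's 'datas' array (see parse_html below).
    if 'work' in item['pageUrl']:
        # ordinary works expose their detail JSON under /p1/product/<id>
        return 'https://www.zcool.com.cn/p1/product/' + item['idStr']
    # articles expose theirs under /p1/article/<id>
    return 'https://www.zcool.com.cn/p1/article/' + item['idStr']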

import random
import time
import datetime
import requests
import pymysql


class ZhankunSpider(object):
    def __init__(self):
        # List API: p is the page number, ps the page size
        self.url = 'https://www.zcool.com.cn/p1/discover/first?p={}&ps=20'
        self.mysql = pymysql.connect(host='localhost', database='tenders', port=3306,
                                     user='root', password='123456')
        self.cur = self.mysql.cursor()
        # Retry counter for the list request (at most 3 attempts per page)
        self.blog = 1
        # Fetch an initial proxy so self.proxys exists before the first request
        self.proxy_get()

    def proxy_get(self):
        """Fetch a fresh proxy from the proxy provider and verify it works."""
        procy = requests.get(r'put your proxy API URL here').json()['data'][0]
        proxy = str(procy['ip']) + ':' + str(procy['port'])
        self.proxys = {'http': 'http://' + proxy,
                       'https': 'https://' + proxy}
        print(self.proxys)
        # Simple health check: if the proxy cannot reach Baidu, fetch another one
        try:
            result = requests.get('https://www.baidu.com/', proxies=self.proxys, timeout=5)
            print(result.status_code)
            if result.status_code != 200:
                time.sleep(0.2)
                self.proxy_get()
                return
        except requests.RequestException:
            time.sleep(0.2)
            self.proxy_get()
            return
        # The proxy is treated as valid for 60 seconds
        self.expire_datetime = datetime.datetime.now() + datetime.timedelta(seconds=60)

    def _check_expire(self):
        # Refresh the proxy once its 60-second lifetime has passed
        if datetime.datetime.now() >= self.expire_datetime:
            self.proxy_get()

    # Send the list-page request
    def get_html(self, url):
        if self.blog <= 3:
            try:
                datas = {
                    'p': 'i',
                    'column': 5
                }
                headers = {'Cookie': 'cookie copied from the browser after logging in',
                           'User-Agent': ''}  # fill in a real User-Agent
                json_ids = requests.get(url=url, headers=headers, data=datas).json()
                return json_ids
            except Exception as e:
                print(e)
                self.blog += 1
                return self.get_html(url)

    # Parse the list response and extract the fields we need
    def parse_html(self, url):
        json_ids = self.get_html(url)
        self._check_expire()
        if json_ids:
            time.sleep(1)
            for dic in json_ids['datas']:
                titles = dic['content']['title']                 # title
                types = dic['content']['typeStr']                # work type
                viewCountStrs = dic['content']['viewCountStr']   # view count
                subCateStrs = dic['content']['subCateStr']
                cateStrs = dic['content']['cateStr']
                url13 = 'https://www.zcool.com.cn/p1/product/' + dic['content']['idStr']
                urll = dic['content']['pageUrl']
                headers1 = {
                    'Cookie': '',
                    'User-Agent': ''}
                datass = ''
                # Works and articles expose their detail JSON under different paths
                if 'work' in urll:
                    url2 = 'https://www.zcool.com.cn/p1/product/' + dic['content']['idStr']
                    try:
                        json_idss = requests.get(url=url2, headers=headers1,
                                                 proxies=self.proxys, timeout=3).json()
                    except Exception:
                        self.proxy_get()
                        json_idss = requests.get(url=url2, headers=headers1,
                                                 proxies=self.proxys, timeout=3).json()
                    time.sleep(1)
                    for dici in json_idss['data']['productImages']:
                        datass = dici['url']        # keeps the last image URL
                else:
                    url2 = 'https://www.zcool.com.cn/p1/article/' + dic['content']['idStr']
                    try:
                        json_idss = requests.get(url=url2, headers=headers1,
                                                 proxies=self.proxys, timeout=3).json()
                    except Exception:
                        self.proxy_get()
                        json_idss = requests.get(url=url2, headers=headers1,
                                                 proxies=self.proxys, timeout=3).json()
                    time.sleep(1)
                    for dici in json_idss['data']['creatorObj']['contentCards']:
                        datass = dici['cover1x']    # keeps the last cover URL
                timeStamp = dic['content']['timeTitleStr']       # publication time string
                photo = dic['content']['cover1x']
                data = {
                    'title': titles,
                    'urls': url13,
                    'address': timeStamp,
                    'configuration': types,
                    'grade': viewCountStrs,
                    'collections': subCateStrs,
                    'price': cateStrs,
                    'unit': photo,
                    'photoadress': datass
                }
                print(data)
                self.save_mysql(data)

    def save_mysql(self, data):
        # Parameterised insert so quotes in titles cannot break the SQL
        str_sql = "insert into meituan values(0, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        self.cur.execute(str_sql, (
            data['title'], data['urls'], data['address'], data['configuration'],
            data['grade'], data['collections'], data['price'], data['unit'],
            data['photoadress']))
        self.mysql.commit()

    def __del__(self):
        self.cur.close()
        self.mysql.close()

    # Entry point
    def run(self):
        try:
            for i in range(1, 5):
                url = self.url.format(i)
                print(i)
                self.parse_html(url)
                time.sleep(random.randint(2, 4))
                # Reset the retry counter before the next page
                self.blog = 1
        except Exception as e:
            print('error:', e)


if __name__ == '__main__':
    spider = ZhankunSpider()
    spider.run()
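
The insert in save_mysql assumes a table named meituan already exists in the tenders database. The article never shows its schema, so the following one-off setup script is only a sketch: the table name, column order and column names mirror the dictionary keys used above, while the column types are assumptions.

import pymysql

# Hypothetical schema: names/order follow save_mysql(), types are assumed.
conn = pymysql.connect(host='localhost', database='tenders', port=3306,
                       user='root', password='123456')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS meituan (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(255),
            urls VARCHAR(512),
            address VARCHAR(255),
            configuration VARCHAR(255),
            grade VARCHAR(255),
            collections VARCHAR(255),
            price VARCHAR(255),
            unit VARCHAR(512),
            photoadress VARCHAR(512)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()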

2. Results
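
To check what was scraped, the saved rows can be read back from MySQL; a minimal sketch, assuming the same connection settings and the meituan table used above:

import pymysql

conn = pymysql.connect(host='localhost', database='tenders', port=3306,
                       user='root', password='123456')
with conn.cursor() as cur:
    cur.execute("SELECT title, grade, urls FROM meituan LIMIT 10")
    for row in cur.fetchall():
        print(row)
conn.close()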
