Scraping AJAX data with Python
The two examples below illustrate the approach.
Example 1: scraping a paginated product list
Approach: 1. analyze the URL; 2. construct the request URL; 3. fetch the data; 4. write it to a file; 5. to save time, do the fetching with multiple threads.
Question 1: will writing from multiple threads cause conflicts? Not in this example: the API response exposes the total item and page counts, each page is fetched in its own thread and appended to a shared list, and once all pages are collected a single separate thread writes the list to the file (a short thread-safety note follows the script).
import requests, json
from threading import Thread

# fetch one page of the AJAX listing and return the parsed JSON (None on error)
def get_page(offset):
    headers = {
        'Host': 'www.xx.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'http://www.xxx.cn/plist.html'
    }
    url = 'http://www.xxxx.cn/plist.html?page=' + str(offset)
    try:
        res = requests.get(url=url, headers=headers)
        return res.json()
    except Exception as e:
        print(e)
        return None
# assemble the data
# shared list that every page thread appends to
res_data = []

def write_file(data):
    if data is not None:
        global res_data
        for item in data['data']:
            dict_res = {}
            dict_res['name'] = item['g_name']
            dict_res['url'] = item['g_img']
            res_data.append(dict_res)

# write the assembled list to a file from a separate thread
def write_json():
    with open('data.txt', 'w', encoding='utf-8') as f:
        json.dump(res_data, f)
    print('write ok')

# worker: fetch one page and add its records to the shared list
def run(page):
    data = get_page(page)
    write_file(data)
if __name__ == '__main__':
    first_page = get_page(1)
    total_page = first_page['last_page']  # total number of pages
    thread_list = []
    # one fetch thread per page (pages are numbered from 1)
    for i in range(1, total_page + 1):
        thread = Thread(target=run, args=(i,))
        thread.start()
        thread_list.append(thread)
    # wait for every page thread to finish before writing
    for thread in thread_list:
        thread.join()
    # write the collected data from another thread
    thread2 = Thread(target=write_json)
    thread2.start()
    thread2.join()
    print('ok')
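A note on Question 1: in CPython a single list.append is effectively atomic, so the page threads above can share res_data safely, and the file is only written after every worker has been joined. If the assembly step ever becomes a read-modify-write (for example, de-duplicating records before appending), guard the shared state with a lock. A minimal sketch of that variant; the lock and the de-duplication set are additions for illustration, not part of the original script:

from threading import Lock

res_data = []
res_lock = Lock()
seen_urls = set()

def write_file(data):
    if data is None:
        return
    for item in data['data']:
        record = {'name': item['g_name'], 'url': item['g_img']}
        # check-then-append is a read-modify-write, so take the lock around it
        with res_lock:
            if record['url'] not in seen_urls:
                seen_urls.add(record['url'])
                res_data.append(record)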
Example 2: scraping street-style ('街拍') photos from Toutiao
import requests
from urllib.parse import urlencode
from hashlib import md5
import os
from multiprocessing.pool import Pool  # use a process pool to download quickly

# fetch one page of search results from the AJAX endpoint
def get_page(offset):
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": 20,
        "en_qc": 1,
        "cur_tab": 1,
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1589868205095",
        "_signature": "Jd--SAAgEBCicjqlJx1oXSXe.1AAHskpOqtHpIydeeXvI8cKTz2Viav8TZqOW9OkhxJBbNwd0QlzXThsKYBWvhtWjp.kb098c2ERJVTL4rZ83Gm.t-HibQGmRwFDHPMX1Ag",
    }
    headers = {
        "Host": "www.xxxx.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
        "Accept": "application/json, text/javascript",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "Referer": "https://www.xxxx.com/search/?keyword=%E8%A1%97%E6%8B%8D",
        "Cookie": "passport_auth_status=4e2c956d9c382caa634e918d5172b8ff%2C0f4b06ebf9e3e318c84a0eecc9442dd5; sso_auth_status=ff08b117e07ebf4300d482cf5cc30d45; sso_uid_tt=6fea73d4793f1b79f59f5a617feaff22; sso_uid_tt_ss=6fea73d4793f1b79f59f5a617feaff22; toutiao_sso_user=39705416627edcab9c45e2b65d2d0c9f; toutiao_sso_user_ss=39705416627edcab9c45e2b65d2d0c9f; sid_guard=b85bcaa805f8516561135b4e14496c92%7C1589791339%7C5184000%7CFri%2C+17-Jul-2020+08%3A42%3A19+GMT; uid_tt=91c92c7b35228506a3625e502b21cb72; uid_tt_ss=91c92c7b35228506a3625e502b21cb72; sid_tt=b85bcaa805f8516561135b4e14496c92; sessionid=b85bcaa805f8516561135b4e14496c92; sessionid_ss=b85bcaa805f8516561135b4e14496c92; tt_webid=6828431109119641095; s_v_web_id=verify_kadidh7u_sioNDvyI_0Rzv_4LHk_8f9u_vaPraRxYLEFs; ttcid=70273172efd6499da7b5d97c6007317527; WEATHER_CITY=%E5%8C%97%E4%BA%AC; __tasessionId=nbz73rlzz1589868028446; SLARDAR_WEB_ID=5bd84172-c5b7-4465-a2a2-647092a758fc; tt_webid=6828431109119641095; csrftoken=bbb8a94b2d289864849a2349ab12ea6d; tt_scid=vktEovYN4sRQ3ObjgWSfbhNk0rK5qwxPELQgXo-SXRE-M9RNyOb-OTz1Dr8Ju2emb318",
        "TE": "Trailers"
    }
    url = "https://www.xxxxx.com/api/search/content?" + urlencode(params)
    try:
        response = requests.get(url=url, headers=headers)
        return response.json()
    except Exception as e:
        print(e)
        return None
# yield the image links contained in the JSON response
def get_img(json_data):
    if json_data and json_data.get('data'):
        for item in json_data.get('data'):
            title = item.get('title')
            image_list = item.get('image_list') or []  # some items carry no images
            print(title)
            for image in image_list:
                yield {
                    'title': title,
                    'img_url': image.get('url')
                }
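get_img is a generator, so each image record can be consumed and saved as soon as it is produced. A quick usage check with a hand-made dictionary that mimics the shape of the endpoint's JSON (the sample values are made up):

sample = {'data': [{'title': 'demo', 'image_list': [{'url': 'http://p.example.com/1.jpg'}]}]}
for entry in get_img(sample):
    print(entry)  # {'title': 'demo', 'img_url': 'http://p.example.com/1.jpg'}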
# save one image to disk, using the post title as the folder name
def save_img(dict_img):
    # create the folder first if it does not exist
    if not os.path.exists(dict_img.get('title')):
        os.mkdir(dict_img.get('title'))
    try:
        res = requests.get(dict_img.get('img_url'))
        if res.status_code == 200:
            # name the file after the MD5 of its content so re-runs skip duplicates
            file_path = '{0}/{1}.{2}'.format(dict_img.get('title'), md5(res.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(res.content)
            else:
                print('already downloaded', file_path)
    except Exception as e:
        print(e)

# worker: fetch one results page and save every image it references
def main(offset):
    json_data = get_page(offset)
    for item in get_img(json_data):
        save_img(item)
group_start = 1
group_end = 20

if __name__ == '__main__':
    pool = Pool()
    # offsets 20, 40, ..., 400 (each results page returns 20 items)
    groups = [x * 20 for x in range(group_start, group_end + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
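Downloading images is I/O-bound rather than CPU-bound, so a thread pool works just as well as a process pool here and avoids the pickling and __main__ constraints of multiprocessing. A minimal alternative sketch of the same driver using concurrent.futures (an option, not what the script above uses; max_workers is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    offsets = [x * 20 for x in range(group_start, group_end + 1)]
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(main, offsets)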