Python百度图片爬虫2

视频演示请在B站搜索“速光网络”

import os
import re
import time
from urllib.parse import quote

import requests
from alive_progress import alive_bar

# HTTP headers sent with the Baidu image-search page requests. The values
# appear to have been copied from a desktop browser session (see User-Agent).
headers = {
    "Accept": "text/plain, */*; q=0.01",
    # "br" is intentionally NOT advertised: requests can only decode Brotli
    # when the optional brotli/brotlicffi package is installed. Without it the
    # response body would arrive undecoded and the regexes that parse the page
    # would silently match nothing.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Host": "image.baidu.com",
    "Referer": "https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E6%B1%BD%E8%BD%A6&step_word=&hs=0&pn=4&spn=0&di=210760&pi=0&rn=1&tn=baiduimagedetail&is=0,0&istype=2&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs=3100769720,1311769304&os=3476023477,1580744637&simid=3386877588,386203947&adpicid=0&lpn=0&ln=1356&fr=&fmq=1611916956498_R&fm=index&ic=0&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=&height=&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=https://gimg2.baidu.com/image_search/src=http://focus123.com.cn/Uploads/images/20171117/1510909072357573.jpg&refer=http://focus123.com.cn&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1614508959&t=4202658381c41e10862a566eb846055f&fromurl=ippr_z2C$qAzdH3FAzdH3Fooo_z&e3Bu5v7f8dn_z&e3BvgAzdH3FgjofAzdH3Ffi5oAzdH3FdAzdH3Fc8lb&gsm=1&rpstart=0&rpnum=0&islist=&querylist=&force=undefined",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.53",
    "X-Requested-With": "XMLHttpRequest",
}

# Base search URL; the keyword and the "pn" (result offset) value are appended.
url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word='
keyword = input("输入图片关键词:")

# Percent-encode the keyword before splicing it into the query string so that
# characters such as '&', '#', '+' or '%' cannot truncate or corrupt the query.
# `keyword` itself is left untouched because it is also used for file naming.
url = url + quote(keyword, safe='') + "&pn="
time_start = time.time()  # start of the run, used for the elapsed-time report

# First request: only used to show the result summary before asking how many
# images to fetch. A timeout keeps the script from hanging on a stalled server.
strhtml = requests.get(url, headers=headers, timeout=10)
string = str(strhtml.text)

totalnum = re.findall(
    '<div id="resultInfo" style="font-size: 13px;">(.*?)</div>', string)
# The summary element may be missing from the page; skip the print instead of
# crashing with IndexError.
if totalnum:
    print(totalnum[0])

# NOTE: this used to be eval(input(...)), which executes whatever the user
# types. Only an integer count is meaningful here, so parse it with int();
# non-numeric input raises ValueError.
countmax = int(input("请输入要爬取的图片数量:"))

img_url_regex = '"thumbURL":"(.*?)",'  # captures the value of every "thumbURL" field
count = 0  # images saved so far
index = 0  # offset appended to the search URL; advanced by the URLs found per page
page = 0  # result pages fetched so far

# Create the download folder once, before the loop. os.path.join uses the
# platform's separator; the previous hand-built r'\\' path only resolved to a
# sub-folder on Windows.
save_dir = os.path.join('.', keyword)
os.makedirs(save_dir, exist_ok=True)

# Loop until enough images are saved. Using "<" (rather than testing for
# equality after each save) also terminates for a zero or negative target.
while count < countmax:
    strhtml = requests.get(url + str(index), headers=headers, timeout=10)
    string = str(strhtml.text)
    print("已爬取网页")
    pic_url = re.findall(img_url_regex, string)
    if not pic_url:
        # An empty page means `index` can no longer advance; without this
        # check the same page would be requested forever.
        print("没有找到更多图片,提前结束")
        break

    page += 1
    print("这是第" + str(page) + "页")
    index += len(pic_url)  # move the offset past the URLs just collected
    print(index)

    # Size the bar by what this page can actually contribute, not a fixed 30.
    bar_num = min(countmax - count, len(pic_url))
    with alive_bar(bar_num) as bar:
        for each in pic_url:
            if count >= countmax:
                break
            try:
                pic = requests.get(each, timeout=5)
                # Treat HTTP error responses as failures so an error page is
                # never written to disk as a .jpg file.
                pic.raise_for_status()
            except requests.RequestException:
                # Only network/HTTP failures are skipped; Ctrl+C and
                # programming errors still propagate.
                print('错误,当前图片无法下载')
                continue

            file_path = os.path.join(
                save_dir, keyword + '_' + str(count + 1) + '.jpg')
            with open(file_path, 'wb') as fp:
                fp.write(pic.content)
            count += 1
            bar()  # tick only after the image has really been saved

time_end = time.time()  # end of the run
print('处理完毕,共耗时%.2f秒: ' % (time_end - time_start))

END
如本资源侵犯了您的权益,请联系投诉邮箱admin@wmphp.com进行举报!我们将在收到邮件的1个小时内处理完毕。 本站仅为平台,发布的资源均为用户投稿或转载!所有资源仅供参考学习使用,请在下载后的24小时内删除,禁止商用! Wmphp.com(完美源码)助力正版,如您有自己的原创产品,可以联系客服投稿,代理出售! Wmphp.com(完美源码)客服QQ:136882447 Wmphp.com(完美源码)商务电话(仅对企业客户/个人用户):15120086569 (微信同步) 请注意:本站不提供任何免费的技术咨询服务,为了节约时间,下载前请确认自己具备相应的技术能力。
完美源码 » Python百度图片爬虫2
3630+

本站勉强运行

4638+

用户总数

693+

资源总数

0+

今日更新

2024-8-29

最后更新时间