共计 893 个字符,预计需要花费 3 分钟才能阅读完成。
import os
import re
import time

import requests
# Wallpaper scraper for www.netbian.com: walks the listing pages, follows each
# wallpaper's detail page, and saves the image found there under ``img/``.

# Browser-like User-Agent sent with every request.
headers = {
    'user-agent': (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    ),
}

BASE_URL = "http://www.netbian.com"
IMG_DIR = "img"
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the run
PAGE_DELAY = 2  # seconds to pause before requesting each listing page

# Numeric wallpaper ids taken from the detail links on a listing page.
DETAIL_ID_RE = re.compile(r'<a href="/desk/(\d+).htm".*?title')
# (image url, title) on a detail page.
# NOTE(review): there is no space between the closing quote of src and `alt`.
# Kept exactly as it was; confirm against the live page markup.
IMG_INFO_RE = re.compile('<img src="(.*?)"alt="(.*?)" ')
# Characters that are path separators or rejected by common filesystems.
INVALID_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def build_page_url(page):
    """Return the listing URL for the 1-based page number *page*.

    Page 1 is ``index.htm``; every other page is ``index_<page>.htm``.
    """
    if page == 1:
        return f"{BASE_URL}/index.htm"
    return f"{BASE_URL}/index_{page}.htm"


def extract_detail_ids(listing_html):
    """Return the wallpaper ids (as strings) linked from a listing page."""
    return DETAIL_ID_RE.findall(listing_html)


def extract_image_info(detail_html):
    """Return ``(img_url, title)`` for the first image match, or ``None``.

    Returning ``None`` instead of indexing ``findall(...)[0]`` lets the caller
    skip a page whose markup does not match rather than die on IndexError.
    """
    match = IMG_INFO_RE.search(detail_html)
    if match is None:
        return None
    return match.groups()


def sanitize_filename(title):
    """Return *title* made safe to use as a single file name.

    Path separators and other reserved characters become ``_`` and
    surrounding whitespace is dropped; an empty result becomes ``untitled``.
    """
    cleaned = INVALID_FILENAME_CHARS_RE.sub("_", title).strip()
    return cleaned or "untitled"


def fetch(url):
    """GET *url* with the shared headers and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def download_wallpaper(img_id):
    """Download the wallpaper with id *img_id* into ``IMG_DIR``.

    Returns:
        True if an image was written, False if the detail page contained no
        matching image tag.

    Raises:
        requests.RequestException: if the detail page or image request fails.
        OSError: if the image file cannot be written.
    """
    url = f"{BASE_URL}/desk/{img_id}.htm"
    resource = fetch(url)
    # Decode as GBK before reading .text so the title is decoded with it.
    resource.encoding = 'gbk'
    info = extract_image_info(resource.text)
    if info is None:
        print(f"未找到图片信息，已跳过: {url}")
        return False
    img_url, title = info
    img_content = fetch(img_url).content
    path = os.path.join(IMG_DIR, sanitize_filename(title) + '.jpg')
    with open(path, mode='wb') as f:
        f.write(img_content)
    return True


def main(first_page=1, last_page=20):
    """Scrape listing pages *first_page*..*last_page* (inclusive).

    A failed listing page or a failed single wallpaper is reported and
    skipped so that one bad request does not abort the whole run.
    """
    # open() does not create missing directories.
    os.makedirs(IMG_DIR, exist_ok=True)
    for page in range(first_page, last_page + 1):
        time.sleep(PAGE_DELAY)
        print(f"正在采集第 {page} 页")
        link = build_page_url(page)
        print(link)
        try:
            link_html = fetch(link).text
        except requests.RequestException as exc:
            print(f"列表页请求失败，已跳过: {link} ({exc})")
            continue
        for img_id in extract_detail_ids(link_html):
            try:
                download_wallpaper(img_id)
            except (requests.RequestException, OSError) as exc:
                print(f"壁纸下载失败，已跳过: {img_id} ({exc})")


if __name__ == "__main__":
    main()
正文完
发表至: 编程
2024-10-11