【声明】:小白写了只为学习技能,请勿用到商业用途,如有侵权、请告知后删除,谢谢!!!
本人是小白,学习了一会Python爬虫,自己随便捣鼓了一下,不知道这么写对不对,效果是实现了,但是不知道有没有更简洁的写法,希望大神提出建议,小白愿意虚心接受。
多的不说,直接上效果图和代码。 
# coding: utf-8
# https://www.meitulu.com/t/xinggan/
# created by XSX
import os
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree
def Randomheader():
    """Build request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: A one-entry mapping ``{'User-Agent': <string>}`` whose value is
        picked at random from a fixed pool of browser identification strings.
    """
    # Every agent appears exactly once so the choice is uniform, and every
    # string has balanced parentheses (the MSIE 9.0 entry used to be
    # truncated before its closing ')').
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    ]
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    return headers
def GetPhotopage(url, headers, timeout=10):
    """Download an index page and collect the album links it lists.

    Args:
        url: Address of the index page to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: The ``href`` values matched by
        ``//div[@class="boxs"]/ul/li/a/@href``, in document order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may block indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of scraping an error page and returning nothing.
    r.raise_for_status()
    # Let requests guess the charset from the body before decoding the text.
    r.encoding = r.apparent_encoding
    html = etree.HTML(r.text)
    Photospagelists = html.xpath('//div[@class="boxs"]/ul/li/a/@href')
    print(Photospagelists)
    return Photospagelists
def GetPagelinks(Photospagelists, headers, timeout=10):
    """Collect the pagination links found on each album page.

    Args:
        Photospagelists: Iterable of album page URLs to visit.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list: One inner list per successfully fetched album, holding the
        ``href`` values matched by ``//div[@id="pages"]/a/@href``. Albums
        whose request fails are reported on stdout and skipped.
    """
    PageLinks1 = []
    for album_url in Photospagelists:
        try:
            r1 = requests.get(album_url, headers=headers, timeout=timeout)
            r1.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable album should not abort the whole crawl.
            print("Skip album {}: {}".format(album_url, exc))
            continue
        html1 = etree.HTML(r1.text)
        PageLinks = html1.xpath('//div[@id="pages"]/a/@href')
        PageLinks1.append(PageLinks)
    print(PageLinks1)
    return PageLinks1
def GetDownlaodJPGlist(PageLinks1):
    """Flatten per-album page links into one list of absolute page URLs.

    Args:
        PageLinks1: Iterable of iterables of hrefs (one inner iterable per
            album).

    Returns:
        list: Absolute URLs in first-seen order. Site-relative hrefs are
        resolved against ``https://www.meitulu.com``; hrefs that are already
        absolute are kept as they are. Repeated links are dropped so that no
        page is listed more than once.
    """
    base_url = "https://www.meitulu.com"
    resolved = (
        # urljoin leaves absolute hrefs alone, unlike plain concatenation.
        urljoin(base_url, str(href))
        for album_links in PageLinks1
        for href in album_links
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    JPGlinks = list(dict.fromkeys(resolved))
    print(JPGlinks)
    return JPGlinks
def DownlaodJPG(JPGlinks, headers, timeout=10):
    """Download every image found on the given pages into ``./JPG``.

    Each page is fetched and its image sources are read from
    ``//div[@class="content"]/center/img/@src``. An image is stored as
    ``./JPG/<second-to-last URL segment>/<last URL segment>``.

    Args:
        JPGlinks: Iterable of page URLs to scan for images.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        None. Pages or images that cannot be fetched are reported on stdout
        and skipped, so a single failure does not stop the run.
    """
    root_dir = "./JPG"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(root_dir, exist_ok=True)
    for JPGlink in JPGlinks:
        time.sleep(0.05)  # small pause between page requests
        try:
            r3 = requests.get(JPGlink, headers=headers, timeout=timeout)
            r3.raise_for_status()
        except requests.RequestException as exc:
            print("Skip page {}: {}".format(JPGlink, exc))
            continue
        html2 = etree.HTML(r3.text)
        jpglistss = html2.xpath('//div[@class="content"]/center/img/@src')
        for jpglists in jpglistss:
            segments = str(jpglists).split('/')
            # Need both a folder segment and a non-empty file name.
            if len(segments) < 2 or not segments[-1]:
                print("Skip image with unexpected src: {}".format(jpglists))
                continue
            name = segments[-1]
            filename = segments[-2]
            album_dir = os.path.join(root_dir, filename)
            os.makedirs(album_dir, exist_ok=True)
            time.sleep(0.1)  # small pause between image requests
            try:
                r4 = requests.get(jpglists, headers=headers, timeout=timeout)
                # Never save an HTTP error body under an image file name.
                r4.raise_for_status()
            except requests.RequestException as exc:
                print("Skip image {}: {}".format(jpglists, exc))
                continue
            with open(os.path.join(album_dir, name), 'wb') as f:
                f.write(r4.content)
    print("Process Success!")
if __name__ == '__main__':
    # Run the crawl stage by stage; each network stage gets its own freshly
    # randomised headers.
    url1 = "https://www.meitulu.com/t/xinggan/"
    album_links = GetPhotopage(url1, Randomheader())
    page_links = GetPagelinks(album_links, Randomheader())
    image_pages = GetDownlaodJPGlist(page_links)
    DownlaodJPG(image_pages, Randomheader())
|