小涩席 发表于 2020-2-22 03:36

小白原创Python爬虫,爬取美图某某网站

【声明】:小白写这个只为学习技能,请勿用于商业用途,如有侵权,请告知后删除,谢谢!!!
    本身是小白,学习了一会Python爬虫,自己随便捣鼓了一下,不知道这么写对不对,效果是实现了,但是不知道有没有更简洁的写法,希望大神提出建议,小白愿意虚心接受。
    多的不说,直接上效果图和代码。{:6_197:}{:6_208:}
# coding: utf-8
# https://www.meitulu.com/t/xinggan/
# created by XSX


import requests
from lxml import etree
import random
import os
import time


def Randomheader():
    """Build request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: A single-entry mapping ``{'User-Agent': <string>}`` whose value
        is drawn with ``random.choice`` from a fixed pool of browser
        identification strings.
    """
    # Each entry appears exactly once so every browser is equally likely to
    # be picked, and every string is a well-formed (balanced) UA value.
    user_agent_list = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    )
    return {'User-Agent': random.choice(user_agent_list)}


def GetPhotopage(url, headers, timeout=10):
    """Fetch a listing page and return the album links found on it.

    Args:
        url: Address of the listing page to download.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: The ``href`` values matched by
        ``//div[@class="boxs"]/ul/li/a/@href`` on the downloaded page.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with an HTTP error status.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing.
    r.raise_for_status()
    # Let requests guess the charset from the body before decoding the text.
    r.encoding = r.apparent_encoding
    html = etree.HTML(r.text)
    Photospagelists = html.xpath('//div[@class="boxs"]/ul/li/a/@href')
    print(Photospagelists)
    return Photospagelists


def GetPagelinks(Photospagelists, headers, timeout=10):
    """Collect the pager links of every album page.

    Args:
        Photospagelists: Iterable of album page URLs to download.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list: One inner list per album, holding the ``href`` values matched
        by ``//div[@id="pages"]/a/@href`` on that album page.

    Raises:
        requests.RequestException: If a connection fails, a request times
            out, or the server answers with an HTTP error status.
    """
    PageLinks1 = []
    for Photospagelist in Photospagelists:
        # A timeout keeps one unresponsive album from blocking the crawl.
        r1 = requests.get(Photospagelist, headers=headers, timeout=timeout)
        # Do not scrape pager links out of an HTTP error page.
        r1.raise_for_status()
        html1 = etree.HTML(r1.text)
        PageLinks1.append(html1.xpath('//div[@id="pages"]/a/@href'))
    print(PageLinks1)
    return PageLinks1


def GetDownlaodJPGlist(PageLinks1):
    """Flatten per-album pager links into a list of absolute page URLs.

    Args:
        PageLinks1: Iterable of iterables; each inner item is a site-relative
            link (it is converted with ``str`` and appended to the site root).

    Returns:
        list: ``"https://www.meitulu.com" + link`` for every link, in
        first-seen order. Each URL appears once, so no page is fetched
        (and no image re-downloaded) more than once.
    """
    JPGlinks = []
    seen = set()
    for Jpglists in PageLinks1:
        # Visit each album's links exactly once; re-walking the albums
        # collected so far would emit earlier albums again and again.
        for link in Jpglists:
            page_url = "https://www.meitulu.com" + str(link)
            if page_url in seen:
                continue
            seen.add(page_url)
            JPGlinks.append(page_url)
    print(JPGlinks)
    return JPGlinks


def DownlaodJPG(JPGlinks, headers, timeout=10):
    """Download every image referenced by the given pages into ``./JPG``.

    For each page the ``src`` values matched by
    ``//div[@class="content"]/center/img/@src`` are downloaded. An image is
    stored as ``./JPG/<second-to-last URL segment>/<last URL segment>``.

    Args:
        JPGlinks: Iterable of page URLs to scan for images.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each response before giving up.

    Pages or images that cannot be fetched (connection error, timeout, HTTP
    error status) are reported on stdout and skipped, so one bad link does
    not abort the run and no error page is saved as an image.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("./JPG", exist_ok=True)
    for JPGlink in JPGlinks:
        time.sleep(0.05)  # small pause between page requests
        try:
            r3 = requests.get(JPGlink, headers=headers, timeout=timeout)
            r3.raise_for_status()
        except requests.RequestException as err:
            print("Skip page %s: %s" % (JPGlink, err))
            continue
        html2 = etree.HTML(r3.text)
        jpglistss = html2.xpath('//div[@class="content"]/center/img/@src')
        for jpglists in jpglistss:
            parts = str(jpglists).split('/')
            if len(parts) < 2:
                # No directory segment to derive the album folder from.
                print("Skip image %s: unexpected URL" % jpglists)
                continue
            name = parts[-1]
            filename = parts[-2]
            album_dir = os.path.join("./JPG", filename)
            os.makedirs(album_dir, exist_ok=True)
            time.sleep(0.1)  # small pause between image requests
            try:
                r4 = requests.get(jpglists, headers=headers, timeout=timeout)
                r4.raise_for_status()
            except requests.RequestException as err:
                print("Skip image %s: %s" % (jpglists, err))
                continue
            with open(os.path.join(album_dir, name), 'wb') as f:
                f.write(r4.content)
        print("Process Success!")


if __name__ == '__main__':
    # Run the crawl one stage at a time. Every network stage receives its
    # own freshly randomised headers, requested right before that stage.
    url1 = "https://www.meitulu.com/t/xinggan/"
    album_urls = GetPhotopage(url1, Randomheader())
    pager_links = GetPagelinks(album_urls, Randomheader())
    page_urls = GetDownlaodJPGlist(pager_links)
    DownlaodJPG(page_urls, Randomheader())

kghong 发表于 2020-2-29 19:24

谢谢分享,

wei1992658 发表于 2020-3-3 00:57

谢谢分享了

hello889 发表于 2020-3-6 12:13

学习了 ,多谢!!好

linjj1020 发表于 2020-3-14 23:05

学习分享。大佬爱你哟!

linjj1020 发表于 2020-3-14 23:18

我找不到下载的文件

915宝宝老公 发表于 2020-3-31 08:45

学习喽。楼主好帅哦

NHFXy56 发表于 2022-2-22 09:42

论坛的繁荣离不开你们的热心分享

yOdGemW 发表于 2022-2-25 14:49

感谢楼主

mbkvpGML4 发表于 2022-2-25 14:55

感谢楼主
页: [1] 2 3 4 5 6 7 8
查看完整版本: 小白原创Python爬虫,爬取美图某某网站