看到有人用 Python 2.7 写了一个，然后我想我用 Python 3 写一个吧。
本来以为爬下的720P的路径有用呢，后来爬了快1W条信息之后才发现，原来只是个CDN链接。
每过一段时间视频后面的参数就会变掉 如果参数不对 视频就没有办法访问到
先当练手吧 后面要做的话 就不抓全部了 每天监控首页前10好了
把每天的前10 DOWN下来
#! /usr/bin/env python
# -*- coding:utf8 -*-
# __author__="leokim"
"""Crawl the HD video listing pages and store each entry in MySQL.

For every listing page the script records, per video: the video page URL,
the thumbnail URL, the preview ("mediabook") URL and the 720p download link
found on the video page. It then follows the "next page" link until there
is none.

NOTE: the 720p links are signed CDN URLs (see the notes above the script),
so the stored value is only useful for a limited time.
"""
import codecs
import http.client
import http.cookiejar
import json
import logging
import re
import string
import time
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser

import pymysql
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# SECURITY NOTE(review): the database password, the site username/password,
# the session cookies and the login token below are hard-coded in source.
# They are kept unchanged here so the script behaves as before, but they
# should be moved to configuration/environment variables and rotated.
conn = pymysql.connect(host='localhost', user='root', passwd='superhero',
                       db='python_test', port=3306, charset='utf8')
cur = conn.cursor()  # obtain a cursor

hosturl = 'https://www.pornhub.com'
login_url = 'https://www.pornhub.com/front/authenticate'
video_page_url = hosturl + '/video?hd=1&page=1'

REQUEST_TIMEOUT = 30        # seconds before a single HTTP request is abandoned
MAX_FETCH_RETRIES = 3       # consecutive failed fetches of one page before giving up
RETRY_DELAY_SECONDS = 5     # pause between retries of a failed page

INSERT_VIDEO_SQL = "INSERT INTO `pornhub` (`url`,`img_src`,`img_media`,`video_720p`) VALUES (%s,%s,%s,%s)"

# Errors a page fetch can reasonably raise: URLError/HTTPError/timeouts are
# OSError subclasses, UnicodeDecodeError and "unknown url type" are ValueError.
_FETCH_ERRORS = (OSError, http.client.HTTPException, ValueError)

_PAGE_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'  # noqa: E501
_PAGE_COOKIE = 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1; il=v11aG6sqWdiUQBzr4O6rEiEnSCWKAR3eEfgwd6hvbe1GAxNDk3MzY0MzY1MzI3MDY0MzkxO1VpbUxtSmlvUDJPZUZtNFZoN19kQVBkS05mcTNEdF84cXZJRVdDajM1eVUu; expiredEnterModalShown=1'  # noqa: E501

_opener = None


def _get_opener():
    """Return the shared cookie-aware opener, building it on first use.

    The opener (and its cookie jar) is created once and reused, instead of
    building a new jar and calling ``install_opener`` on every page fetch.
    """
    global _opener
    if _opener is None:
        cj = http.cookiejar.CookieJar()
        _opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj))
        _opener.addheaders = [
            ('User-Agent', _PAGE_USER_AGENT),
            ('Cookie', _PAGE_COOKIE),
        ]
        urllib.request.install_opener(_opener)
    return _opener


def do_login(login_url):
    """POST the login form to *login_url*.

    The response body is not inspected. Network errors
    (``urllib.error.URLError`` and friends) propagate to the caller.
    """
    # No explicit Content-Length: urllib derives it from the encoded body.
    # The previously hard-coded value only matched the current form fields.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; expiredEnterModalShown=1; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1',  # noqa: E501
        'Host': 'www.pornhub.com',
        'Origin': 'https://www.pornhub.com',
        'Referer': 'https://www.pornhub.com/login',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',  # noqa: E501
        'X-Requested-With': 'XMLHttpRequest',
    }
    postData = {
        'loginPage': '1',
        'redirect': 'RF9jCo8gyhc9dJlS3QxiVEXNRoQCaw8_FuezIgPRrCQ.',
        'token': 'MTQ5NDY4NTk0OUGXghl0xCEzQMdNs0i7F3J0fV51kVyaf8XXuqe-IBB7f75TJKnNC0tDnS9uh4r1yC8SSDcZ27q-HIkNAOWrdyo.',  # noqa: E501
        'username': 'iamsuperhero',
        'password': '281274954',
        'remember_me': 'on',
    }
    body = urllib.parse.urlencode(postData).encode('utf-8')
    request = urllib.request.Request(login_url, body, headers)
    # Context manager so the HTTP response is always closed.
    with _get_opener().open(request, timeout=REQUEST_TIMEOUT):
        pass


def get_page_soup(hosturl):
    """Fetch *hosturl* and return it parsed as a ``BeautifulSoup`` tree.

    Returns ``False`` (the value existing callers test for) when the page
    cannot be downloaded or is not valid UTF-8. Only fetch/decoding errors
    are caught, so Ctrl-C and programming errors are no longer swallowed.
    """
    try:
        with _get_opener().open(hosturl, timeout=REQUEST_TIMEOUT) as response:
            html_bytes = response.read()
        html_string = html_bytes.decode('utf-8')
    except _FETCH_ERRORS as err:
        logger.warning("Failed to fetch %s: %s", hosturl, err)
        return False
    return BeautifulSoup(html_string, 'html.parser')


def get_video_list(soup):
    """Store every video found on a parsed listing page.

    Each ``videoBox`` entry yields one row (url, thumbnail, preview, 720p
    link). A malformed entry or a failed insert is logged and skipped rather
    than aborting the rest of the page, and the page's rows are committed at
    the end (PyMySQL does not autocommit by default).

    Returns ``None`` when everything was stored, ``'error'`` when the listing
    container is missing or at least one entry could not be stored.
    """
    listing = soup.find("ul", class_="search-video-thumbs")
    if listing is None:
        logger.warning("Listing container not found on page")
        return 'error'

    had_error = False
    for video in listing.find_all("", class_="videoBox"):
        try:
            a = video.find("a")
            url = urllib.parse.urljoin(hosturl, a["href"])
            img = a.find("img")
            img_src = img["data-mediumthumb"]
            img_media = img["data-mediabook"]
        except (AttributeError, KeyError, TypeError) as err:
            # Missing <a>/<img> or missing attribute: skip this entry only.
            logger.warning("Skipping malformed video entry: %r", err)
            had_error = True
            continue

        video_720p = get_video_hd_addr(url)

        # Insert into the database.
        try:
            cur.execute(INSERT_VIDEO_SQL,
                        (url, img_src, img_media, video_720p))
        except pymysql.MySQLError as err:
            logger.warning("Insert failed for %s: %s", url, err)
            had_error = True

    try:
        conn.commit()
    except pymysql.MySQLError as err:
        logger.warning("Commit failed: %s", err)
        had_error = True

    return 'error' if had_error else None


def get_next_page_url(soup):
    """Return the href of the "next page" link, or ``False`` if there is none."""
    next_item = soup.find("li", class_="page_next")
    if next_item is None:
        return False
    link = next_item.find("a")
    if link is None or not link.get("href"):
        return False
    return link["href"]


def check_login(check_url):
    """Return the ``notificationIcons`` element of *check_url*, or ``None``.

    The script treats the presence of that element as "logged in". ``None``
    is also returned when the page could not be fetched.
    """
    soup = get_page_soup(check_url)
    if not soup:
        return None
    return soup.find(id="notificationIcons")


def get_video_hd_addr(video_url):
    """Return the 720p download link on a video page, or ``None``.

    A download button matches when its third child node is the text
    ``720p``. ``None`` is returned when the page cannot be fetched or no
    such button exists.
    """
    print(video_url)
    soup = get_page_soup(video_url)
    if not soup:
        return None
    for a in soup.find_all("a", class_="downloadBtn"):
        if len(a.contents) < 3:
            continue
        label = a.contents[2]
        # Only text nodes can carry the quality label; skip nested tags.
        if isinstance(label, str) and label.strip() == '720p':
            return a.get("href")
    return None


def main():
    """Log in if needed, then crawl every listing page, last one included."""
    try:
        # Login check.
        if check_login(video_page_url):
            print('状态: 已登录.')
            print('=============================')
        else:
            do_login(login_url)

        url = video_page_url
        failures = 0
        while url:
            soup = get_page_soup(url)
            if not soup:
                # Bounded retry with a pause instead of spinning forever on
                # the same URL.
                failures += 1
                if failures >= MAX_FETCH_RETRIES:
                    logger.error("Giving up on %s after %d failed attempts",
                                 url, failures)
                    break
                time.sleep(RETRY_DELAY_SECONDS)
                continue
            failures = 0

            # Scrape before looking for the next link so the final page
            # (which has no next link) is stored too.
            get_video_list(soup)
            next_page_url = get_next_page_url(soup)
            url = (urllib.parse.urljoin(hosturl, next_page_url)
                   if next_page_url else None)
    finally:
        cur.close()
        conn.close()

    print('程序执行完毕.')


if __name__ == "__main__":
    main()