看到有人用 Python 2.7 写了一个,于是我想用 Python 3 也写一个。
本来以为爬下来的 720P 路径有用,后来爬了快 1W 条信息之后才发现,原来只是个 CDN 链接:
每过一段时间,视频链接后面的参数就会变掉;如果参数不对,视频就没有办法访问到。
先当练手吧。后面要做的话就不抓全部了,每天监控首页前 10 好了,
把每天的前 10 下载下来。
#! /usr/bin/env python
# -*- coding:utf8 -*-
# __author__="leokim"
from bs4 import BeautifulSoup
import re
from html.parser import HTMLParser
import urllib.request, urllib.parse, http.cookiejar
import http.cookiejar
import string
import codecs
import time
import pymysql
import json
# Module-level MySQL connection, opened at import time and shared by the
# whole script. NOTE(review): host and credentials are hard-coded here.
conn=pymysql.connect(host='localhost',user='root',passwd='superhero',db='python_test',port=3306,charset='utf8')
cur=conn.cursor()# Obtain a cursor; get_video_list() issues its INSERTs through it
# Site root; relative hrefs scraped from pages are appended to this.
hosturl = 'https://www.pornhub.com'
# Endpoint that do_login() POSTs the login form to.
login_url = 'https://www.pornhub.com/front/authenticate'
def do_login(login_url):
    """POST the login form to *login_url*.

    The request carries a fixed set of browser-like headers and a fixed
    form body. The response body is not inspected; the response is only
    opened and closed again.

    Args:
        login_url: Absolute URL of the authentication endpoint.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``);
        nothing is caught here.
    """
    # NOTE(review): the cookie, token, username and password below are
    # hard-coded values; they should come from configuration rather than
    # living in the source file.
    #
    # No Content-Length header is set on purpose: urllib derives it from the
    # encoded body, so the header stays correct if any form field changes.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; expiredEnterModalShown=1; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1',
        'Host': 'www.pornhub.com',
        'Origin': 'https://www.pornhub.com',
        'Referer': 'https://www.pornhub.com/login',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    form_fields = {
        'loginPage': '1',
        'redirect': 'RF9jCo8gyhc9dJlS3QxiVEXNRoQCaw8_FuezIgPRrCQ.',
        'token': 'MTQ5NDY4NTk0OUGXghl0xCEzQMdNs0i7F3J0fV51kVyaf8XXuqe-IBB7f75TJKnNC0tDnS9uh4r1yC8SSDcZ27q-HIkNAOWrdyo.',
        'username': 'iamsuperhero',
        'password': '281274954',
        'remember_me': 'on',
    }
    body = urllib.parse.urlencode(form_fields).encode('utf-8')
    request = urllib.request.Request(login_url, body, headers)
    # Open the response in a ``with`` block so the underlying socket is
    # released deterministically instead of whenever it is garbage-collected.
    with urllib.request.urlopen(request):
        pass
def get_page_soup(hosturl):
    """Download *hosturl* and parse it into a BeautifulSoup tree.

    Each call builds a cookie-aware opener with a fixed User-Agent and
    Cookie header and installs it as the process-wide urllib opener, so
    later ``urllib.request.urlopen`` calls in this script go through it too.

    Args:
        hosturl: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``False`` if fetching, decoding or parsing failed.
    """
    try:
        jar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
        # NOTE(review): the Cookie value is a hard-coded session string.
        opener.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
            ('Cookie', 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1; il=v11aG6sqWdiUQBzr4O6rEiEnSCWKAR3eEfgwd6hvbe1GAxNDk3MzY0MzY1MzI3MDY0MzkxO1VpbUxtSmlvUDJPZUZtNFZoN19kQVBkS05mcTNEdF84cXZJRVdDajM1eVUu; expiredEnterModalShown=1'),
        ]
        urllib.request.install_opener(opener)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(hosturl) as response:
            html_bytes = response.read()
        html_string = html_bytes.decode('utf-8')
        return BeautifulSoup(html_string, 'html.parser')
    except Exception:
        # Best-effort fetch: any ordinary failure is reported as False.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so Ctrl-C can stop a crawl.
        return False
def get_video_list(soup):
    """Store every video found on a listing page in the ``pornhub`` table.

    For each ``videoBox`` element inside the ``search-video-thumbs`` list,
    the function reads the link and thumbnail attributes, looks up the 720p
    download address via ``get_video_hd_addr`` and inserts one row.

    Args:
        soup: Parsed listing page, as returned by ``get_page_soup``.

    Returns:
        ``None`` when the whole page was processed, or the string
        ``'error'`` as soon as any step fails (remaining videos on the page
        are skipped).
    """
    sql = "INSERT INTO `pornhub` (`url`,`img_src`,`img_media`,`video_720p`) VALUES (%s,%s,%s,%s)"
    try:
        thumbs = soup.find("ul", class_="search-video-thumbs")
        video_boxs = thumbs.find_all("", class_="videoBox")
        for video in video_boxs:
            a = video.find("a")
            url = hosturl + a["href"]
            img = a.find("img")
            img_src = img["data-mediumthumb"]
            img_media = img["data-mediabook"]
            video_720p = get_video_hd_addr(url)
            # Insert into the database (parameterized query).
            cur.execute(sql, (url, img_src, img_media, video_720p))
            # PyMySQL does not autocommit by default; without this the rows
            # are never persisted. Committing per row keeps everything
            # inserted before a later failure, and is cheap next to the
            # HTTP request each row already costs.
            conn.commit()
    except Exception:
        # Missing markup, a failed lookup or a DB error all end up here.
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        return 'error'
def get_next_page_url(soup):
    """Return the href of the "next page" link on a listing page.

    Args:
        soup: Parsed listing page.

    Returns:
        The ``href`` of the ``<a>`` inside the ``<li class="page_next">``
        element, or ``False`` when there is no such item, the item has no
        link, or the link has no ``href``.
    """
    next_item = soup.find("li", class_="page_next")
    if next_item is None:
        return False
    anchor = next_item.find("a")
    if anchor is None:
        # A pager item without a link would otherwise raise on subscripting.
        return False
    return anchor.get("href", False)
def check_login(check_url):
    """Report whether the page at *check_url* shows the notification icons.

    The caller treats a truthy result as "already logged in".

    Args:
        check_url: URL of a page to fetch and inspect.

    Returns:
        The element with ``id="notificationIcons"`` if the page contains
        one, otherwise ``None`` (also when the page could not be fetched).
    """
    soup = get_page_soup(check_url)
    if not soup:
        # get_page_soup() returns False on failure; calling .find on that
        # would raise AttributeError.
        return None
    return soup.find(id="notificationIcons")
def get_video_hd_addr(video_url):
    """Return the 720p download link found on a video page.

    The URL is printed before it is fetched. The page is then searched for
    ``<a class="downloadBtn">`` elements; the first one whose third child
    node is the text ``720p`` (ignoring surrounding whitespace) wins.

    Args:
        video_url: Absolute URL of the video page.

    Returns:
        The ``href`` of the matching link, or ``None`` when the page could
        not be fetched or no 720p link is present.
    """
    print(video_url)
    soup = get_page_soup(video_url)
    if not soup:
        # Fetch failed (get_page_soup returned False): nothing to search.
        return None
    for link in soup.find_all("a", class_="downloadBtn"):
        children = link.contents
        if len(children) < 3:
            # Not enough child nodes to carry a label in the third position.
            continue
        label = children[2]
        # Only text nodes (str subclasses) have a usable .strip().
        if isinstance(label, str) and label.strip() == '720p':
            return link["href"]
    return None
# Script entry: make sure we are logged in, then walk the HD listing pages
# from page 1, storing every page's videos until there is no next-page link.
video_page_url = hosturl+'/video?hd=1&page=1'
# Login check
if check_login(video_page_url):
    print('状态: 已登录.')
    print('=============================')
else:
    do_login(login_url)

# A page that cannot be fetched is retried a few times with a pause, then
# the crawl stops, so a dead connection cannot spin in a tight loop forever.
max_fetch_failures = 3
retry_delay_seconds = 5
fetch_failures = 0

url = video_page_url
try:
    while url:
        soup = get_page_soup(url)
        if not soup:
            fetch_failures += 1
            if fetch_failures >= max_fetch_failures:
                print('Giving up on %s after %d failed attempts.' % (url, fetch_failures))
                break
            time.sleep(retry_delay_seconds)
            continue
        fetch_failures = 0
        # Scrape the page before checking for a next link, so the last page
        # (which has no next link) is stored too.
        get_video_list(soup)
        # PyMySQL does not autocommit by default; persist each page's rows.
        conn.commit()
        next_page_url = get_next_page_url(soup)
        url = hosturl+next_page_url if next_page_url else None
finally:
    # Release the database resources however the crawl ended.
    cur.close()
    conn.close()
print('程序执行完毕.')