pornhub爬虫

看到有人用2.7写了一个然后我想我用3写一个吧 

本来以为爬下来的720P路径有用，后来爬了快1W条信息之后才发现，原来只是个CDN链接

每过一段时间视频后面的参数就会变掉 如果参数不对 视频就没有办法访问到

先当练手吧 后面要做的话 就不抓全部了 每天监控首页前10好了

把每天的前10 DOWN下来

#! /usr/bin/env python
# -*- coding:utf8 -*-
# __author__="leokim"
from bs4 import BeautifulSoup

import re
from html.parser import HTMLParser
import urllib.request, urllib.parse, http.cookiejar
import http.cookiejar
import string
import codecs
import time
import pymysql
import json

# Module-level MySQL connection; the cursor created from it is what
# get_video_list() uses to insert scraped rows.
# NOTE(review): host, user and password are hard-coded here -- consider moving
# them to configuration before sharing this script.
conn=pymysql.connect(host='localhost',user='root',passwd='superhero',db='python_test',port=3306,charset='utf8')
cur=conn.cursor()  # Obtain a cursor.

# Base site URL; relative hrefs scraped from pages are appended to it.
hosturl = 'https://www.pornhub.com'
# Endpoint that do_login() posts the login form to.
login_url = 'https://www.pornhub.com/front/authenticate'


def do_login(login_url):
	"""POST the hard-coded login form to ``login_url``.

	The request is sent with ``urllib.request.urlopen``, so it goes through
	whichever opener is currently installed process-wide.

	Args:
		login_url: URL of the authentication endpoint.

	Returns:
		None. The response is opened and closed without reading its body.

	Raises:
		urllib.error.URLError: if the request cannot be completed.
	"""
	# NOTE(review): the cookie, form token and account credentials below are
	# hard-coded values captured from a browser session; they presumably
	# expire -- confirm they are still valid before relying on this.
	#
	# Content-Length is deliberately not set here: urllib derives it from the
	# encoded body, so it can never drift out of sync with the form data.
	headers = {
		'Accept':'application/json, text/javascript, */*; q=0.01',
		'Accept-Encoding':'gzip, deflate, br',
		'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
		'Connection':'keep-alive',
		'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
		'Cookie':'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; expiredEnterModalShown=1; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1',
		'Host':'www.pornhub.com',
		'Origin':'https://www.pornhub.com',
		'Referer':'https://www.pornhub.com/login',
		'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
		'X-Requested-With':'XMLHttpRequest'
	}

	form_fields = {
		'loginPage': '1',
		'redirect': 'RF9jCo8gyhc9dJlS3QxiVEXNRoQCaw8_FuezIgPRrCQ.',
		'token': 'MTQ5NDY4NTk0OUGXghl0xCEzQMdNs0i7F3J0fV51kVyaf8XXuqe-IBB7f75TJKnNC0tDnS9uh4r1yC8SSDcZ27q-HIkNAOWrdyo.',
		'username': 'iamsuperhero',
		'password': '281274954',
		'remember_me' : 'on'
	}
	body = urllib.parse.urlencode(form_fields).encode('utf-8')

	request = urllib.request.Request(login_url, body, headers)
	# Use the response as a context manager so the connection is released
	# even though the body is not consumed.
	with urllib.request.urlopen(request):
		pass

def get_page_soup(hosturl):
	"""Download ``hosturl`` and parse it into a BeautifulSoup tree.

	Every call builds a new cookie-aware opener carrying a fixed User-Agent
	and Cookie header and installs it as the process-wide default opener,
	so later ``urllib.request.urlopen`` calls elsewhere also use it.

	Args:
		hosturl: URL of the page to fetch.

	Returns:
		The parsed ``BeautifulSoup`` document, or ``False`` when fetching,
		decoding or parsing fails.
	"""
	try:
		cj = http.cookiejar.CookieJar()
		opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
		opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
		    ('Cookie', 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1; il=v11aG6sqWdiUQBzr4O6rEiEnSCWKAR3eEfgwd6hvbe1GAxNDk3MzY0MzY1MzI3MDY0MzkxO1VpbUxtSmlvUDJPZUZtNFZoN19kQVBkS05mcTNEdF84cXZJRVdDajM1eVUu; expiredEnterModalShown=1')]

		urllib.request.install_opener(opener)
		# Close the response as soon as the body has been read.
		with urllib.request.urlopen(hosturl) as response:
			html_bytes = response.read()
		html_string = html_bytes.decode('utf-8')

		return BeautifulSoup(html_string, 'html.parser')
	except Exception:
		# Only ordinary errors are turned into False; KeyboardInterrupt and
		# SystemExit must propagate so the crawler can still be stopped.
		return False
	

def get_video_list(soup):
	"""Store every video found on a listing page in the ``pornhub`` table.

	For each ``videoBox`` entry the detail-page URL, the two thumbnail
	attributes and the 720p download link (looked up through
	``get_video_hd_addr``) are inserted as one row. A video that fails is
	skipped so the rest of the page is still stored, and the inserts are
	committed once the page has been processed.

	Args:
		soup: parsed listing page as returned by ``get_page_soup``.

	Returns:
		None when every video was stored; the string ``'error'`` when the
		listing could not be located, at least one video failed, or the
		commit failed.
	"""
	try:
		video_boxs = soup.find("ul",class_="search-video-thumbs").find_all("",class_="videoBox")
	except AttributeError:
		# Either soup is not a parsed document or the page has no listing.
		return 'error'

	# Insert statement; values are passed separately so the driver escapes them.
	sql = "INSERT INTO `pornhub` (`url`,`img_src`,`img_media`,`video_720p`) VALUES (%s,%s,%s,%s)"
	had_error = False
	for video in video_boxs:
		try:
			a = video.find("a")
			url = hosturl+a["href"]
			img = a.find("img")

			img_src = img["data-mediumthumb"]
			img_media = img["data-mediabook"]
			video_720p = get_video_hd_addr(url)

			cur.execute(sql, (url,img_src,img_media,video_720p))
		except Exception as exc:
			# One malformed entry must not discard the whole page.
			had_error = True
			print('skipped video: %r' % (exc,))

	try:
		# PyMySQL does not autocommit by default, so without an explicit
		# commit the inserted rows may never be persisted.
		conn.commit()
	except Exception as exc:
		print('commit failed: %r' % (exc,))
		return 'error'

	if had_error:
		return 'error'
	return None

def get_next_page_url(soup):
	"""Return the href of the "next page" link on a listing page.

	Args:
		soup: parsed listing page.

	Returns:
		The ``href`` of the anchor inside the ``li.page_next`` element, or
		``False`` when the page has no such element.
	"""
	next_item = soup.find("li",class_="page_next")
	# Guard clause: no pagination element means there is no next page.
	if next_item is None:
		return False
	return next_item.find("a")["href"]

def check_login(check_url):
	"""Report whether the page at ``check_url`` shows a logged-in session.

	The check looks for the element with id ``notificationIcons``.

	Args:
		check_url: URL of a page to fetch and inspect.

	Returns:
		The ``notificationIcons`` element when it is present, otherwise
		``None`` (also when the page could not be fetched).
	"""
	soup = get_page_soup(check_url)
	if not soup:
		# get_page_soup() returns False on failure; treat that as "not
		# logged in" instead of raising AttributeError on False.find().
		return None
	return soup.find(id="notificationIcons")


def get_video_hd_addr(video_url):
	"""Return the 720p download link found on a video page.

	The URL is printed before the page is fetched. Each ``a.downloadBtn``
	anchor is examined and the first one whose third child node is the
	text ``720p`` wins.

	Args:
		video_url: absolute URL of the video detail page.

	Returns:
		The ``href`` of the 720p download anchor, or ``None`` when the page
		could not be fetched or offers no 720p link.
	"""
	print(video_url)
	soup = get_page_soup(video_url)
	if not soup:
		# Fetch failed (get_page_soup returned False): nothing to inspect.
		return None

	for a in soup.find_all("a",class_="downloadBtn"):
		contents = a.contents
		# Anchors with fewer than three children cannot carry the quality
		# label at index 2; str() keeps a non-text child from breaking strip().
		if len(contents) > 2 and str(contents[2]).strip() == '720p':
			return a["href"]
	return None
			

video_page_url = hosturl+'/video?hd=1&page=1'

# Login check: only submit the login form when the first listing page does
# not already show a logged-in session.
if check_login(video_page_url):
	print('状态: 已登录.')
	print('=============================')
else:
	do_login(login_url)


# Crawl the listing pages one after another, following the "next page" link.
MAX_FETCH_FAILURES = 3   # consecutive failed fetches tolerated for one page
RETRY_DELAY_SECONDS = 5  # pause before retrying a failed fetch

url = video_page_url
fetch_failures = 0
try:
	while True:
		soup = get_page_soup(url)
		if not soup:
			# A failed fetch used to retry the same URL forever with no
			# delay; back off and give up after a few attempts instead.
			fetch_failures += 1
			if fetch_failures >= MAX_FETCH_FAILURES:
				print('giving up on %s after %d failed attempts' % (url, fetch_failures))
				break
			time.sleep(RETRY_DELAY_SECONDS)
			continue
		fetch_failures = 0

		# Store this page before looking for the next link, so the final
		# page (which has no next link) is scraped as well.
		get_video_list(soup)
		# PyMySQL does not autocommit by default; persist each page's rows.
		conn.commit()

		next_page_url = get_next_page_url(soup)
		if not next_page_url:
			print('程序执行完毕.')
			break
		url = hosturl+next_page_url
finally:
	# Release the database resources however the crawl ends.
	cur.close()
	conn.close()