看到有人用2.7写了一个然后我想我用3写一个吧
本来以为爬下的720P的路径有用呢 后来爬了快1W条信息之后才发现 原来只是个CDN链接
每过一段时间视频后面的参数就会变掉 如果参数不对 视频就没有办法访问到
先当练手吧 后面要做的话 就不抓全部了 每天监控首页前10好了
把每天的前10 DOWN下来
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | #! /usr/bin/env python # -*- coding:utf8 -*- # __author__="leokim" from bs4 import BeautifulSoup import re from html.parser import HTMLParser import urllib.request, urllib.parse, http.cookiejar import http.cookiejar import string import codecs import time import pymysql import json conn = pymysql.connect(host = 'localhost' ,user = 'root' ,passwd = 'superhero' ,db = 'python_test' ,port = 3306 ,charset = 'utf8' ) cur = conn.cursor() #获取一个游标 hosturl = 'https://www.pornhub.com' login_url = 'https://www.pornhub.com/front/authenticate' def do_login(login_url): headers = { 'Accept' : 'application/json, text/javascript, */*; q=0.01' , 'Accept-Encoding' : 'gzip, deflate, br' , 'Accept-Language' : 'zh-CN,zh;q=0.8,en;q=0.6' , 'Connection' : 'keep-alive' , 'Content-Length' : '228' , 'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8' , 'Cookie' : 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; expiredEnterModalShown=1; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1' , 'Host' : 'www.pornhub.com' , 'Origin' : 'https://www.pornhub.com' , 'Referer' : 'https://www.pornhub.com/login' , 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' , 'X-Requested-With' : 'XMLHttpRequest' } postData = { 'loginPage' : '1' , 'redirect' : 
'RF9jCo8gyhc9dJlS3QxiVEXNRoQCaw8_FuezIgPRrCQ.' , 'token' : 'MTQ5NDY4NTk0OUGXghl0xCEzQMdNs0i7F3J0fV51kVyaf8XXuqe-IBB7f75TJKnNC0tDnS9uh4r1yC8SSDcZ27q-HIkNAOWrdyo.' , 'username' : 'iamsuperhero' , 'password' : '281274954' , 'remember_me' : 'on' } postData = urllib.parse.urlencode(postData).encode( 'utf-8' ); request = urllib.request.Request(login_url, postData, headers) response = urllib.request.urlopen(request) def get_page_soup(hosturl): try : cj = http.cookiejar.CookieJar() opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) opener.addheaders = [( 'User-Agent' , 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36' ), ( 'Cookie' , 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1; il=v11aG6sqWdiUQBzr4O6rEiEnSCWKAR3eEfgwd6hvbe1GAxNDk3MzY0MzY1MzI3MDY0MzkxO1VpbUxtSmlvUDJPZUZtNFZoN19kQVBkS05mcTNEdF84cXZJRVdDajM1eVUu; expiredEnterModalShown=1' )] urllib.request.install_opener(opener) html_bytes = urllib.request.urlopen(hosturl).read() html_string = html_bytes.decode( 'utf-8' ) return BeautifulSoup(html_string, 'html.parser' ) except : return False def get_video_list(soup): try : video_boxs = soup.find( "ul" , class_ = "search-video-thumbs" ).find_all(" ",class_=" videoBox") for video in video_boxs: a = video.find( "a" ) url = hosturl + a[ "href" ]; img = a.find( "img" ) img_src = img[ "data-mediumthumb" ] img_media = img[ "data-mediabook" ] video_720p = get_video_hd_addr(url) #插入数据库 sql = "INSERT INTO `pornhub` (`url`,`img_src`,`img_media`,`video_720p`) VALUES (%s,%s,%s,%s)" cur.execute(sql, (url,img_src,img_media,video_720p)) except : return 'error' # print('###################################################') # print(img_src) # print(img_media) # 
print(video_720p) # print('###################################################') def get_next_page_url(soup): #page_next last_page_a = soup.find( "li" , class_ = "page_next" ) if (last_page_a is not None ): a = last_page_a.find( "a" ) href = a[ "href" ] return href else : return False def check_login(check_url): soup = get_page_soup(check_url) notification = soup.find( id = "notificationIcons" ) return notification def get_video_hd_addr(video_url): print (video_url) soup = get_page_soup(video_url) a_list = soup.find_all( "a" , class_ = "downloadBtn" ) for a in a_list: if (a.contents[ 2 ].strip() = = '720p' ): return a[ "href" ] video_page_url = hosturl + '/video?hd=1&page=1' #登录验证 if (check_login(video_page_url)): print ( '状态: 已登录.' ) print ( '=============================' ) else : do_login(login_url) url = hosturl + '/video?hd=1&page=1' tag = True while (tag): soup = get_page_soup(url) if (soup): next_page_url = get_next_page_url(soup) if (next_page_url): get_video_list(soup) # video_list = get_video_list(soup) # for video_page in video_list: # print(video_page) url = hosturl + next_page_url else : tag = False print ( '程序执行完毕.' ) |