from
bs4
import
BeautifulSoup
import
re
from
html.parser
import
HTMLParser
import
urllib.request, urllib.parse, http.cookiejar
import
http.cookiejar
import
string
import
codecs
import
time
import
pymysql
import
json
# Connection settings for the MySQL database; `cur` is the cursor that
# get_video_list() uses to run its INSERT statements.
_db_settings = {
    'host': 'localhost',
    'user': 'root',
    'passwd': 'superhero',
    'db': 'python_test',
    'port': 3306,
    'charset': 'utf8',
}
conn = pymysql.connect(**_db_settings)
cur = conn.cursor()

# Site root (prefixed onto relative hrefs found in pages) and the endpoint
# that do_login() posts the login form to.
hosturl = 'https://www.pornhub.com'
login_url = 'https://www.pornhub.com/front/authenticate'
def do_login(login_url):
    """POST the login form to *login_url* and return the HTTP status code.

    The form fields and request headers are fixed values defined below.
    Errors raised by urllib (e.g. ``urllib.error.HTTPError``) propagate to
    the caller, as before.

    NOTE(review): the credentials, token and cookie are hard-coded; the
    token looks session-specific -- confirm it is still accepted and
    consider moving the secrets out of the source file.
    """
    # No explicit 'Content-Length' here: urllib derives it from the encoded
    # body, so the header can never disagree with the actual payload size.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; expiredEnterModalShown=1; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1',
        'Host': 'www.pornhub.com',
        'Origin': 'https://www.pornhub.com',
        'Referer': 'https://www.pornhub.com/login',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    postData = {
        'loginPage': '1',
        'redirect': 'RF9jCo8gyhc9dJlS3QxiVEXNRoQCaw8_FuezIgPRrCQ.',
        'token': 'MTQ5NDY4NTk0OUGXghl0xCEzQMdNs0i7F3J0fV51kVyaf8XXuqe-IBB7f75TJKnNC0tDnS9uh4r1yC8SSDcZ27q-HIkNAOWrdyo.',
        'username': 'iamsuperhero',
        'password': '281274954',
        'remember_me': 'on',
    }
    body = urllib.parse.urlencode(postData).encode('utf-8')
    request = urllib.request.Request(login_url, body, headers)
    # Use the response as a context manager so the connection is released.
    with urllib.request.urlopen(request) as response:
        return response.status
def get_page_soup(hosturl):
    """Download *hosturl* and return its HTML parsed as a BeautifulSoup tree.

    A fresh cookie-aware opener carrying a fixed User-Agent and Cookie
    header is installed globally on every call, then the page is fetched,
    decoded as UTF-8 and parsed with the ``html.parser`` backend.

    Returns:
        The BeautifulSoup object on success, or ``False`` when fetching,
        decoding or parsing fails.
    """
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
        ('Cookie', 'platform=pc; ss=552610383637701883; bs=09uvdhtd0plqvxtywxkan6d3u9aahn2f; RNLBSERVERID=ded6752; FastPopSessionRequestNumber=1; desired_username=iamsuperhero%7Cjl6668029%40sina.com; performance_timing=home; _gat=1; _ga=GA1.2.1975907893.1494638097; _gid=GA1.2.1534576924.1494686000; FPSRN=1; il=v11aG6sqWdiUQBzr4O6rEiEnSCWKAR3eEfgwd6hvbe1GAxNDk3MzY0MzY1MzI3MDY0MzkxO1VpbUxtSmlvUDJPZUZtNFZoN19kQVBkS05mcTNEdF84cXZJRVdDajM1eVUu; expiredEnterModalShown=1'),
    ]
    urllib.request.install_opener(opener)
    try:
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(hosturl) as response:
            html_bytes = response.read()
        html_string = html_bytes.decode('utf-8')
        return BeautifulSoup(html_string, 'html.parser')
    except Exception:
        # Best-effort fetch: any ordinary failure is reported as False.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the crawl can still be stopped.
        return False
def get_video_list(soup):
    """Store every video found on a listing page in the `pornhub` table.

    For each ``videoBox`` element inside the ``ul.search-video-thumbs``
    list, the video URL, the two thumbnail attributes and the 720p
    download link (via ``get_video_hd_addr``) are inserted as one row.

    Returns:
        ``None`` when the whole page was processed, or the string
        ``'error'`` as soon as any step fails (rows committed before the
        failure are kept).
    """
    sql = "INSERT INTO `pornhub` (`url`,`img_src`,`img_media`,`video_720p`) VALUES (%s,%s,%s,%s)"
    try:
        thumbs = soup.find("ul", class_="search-video-thumbs")
        # The tag-name argument of this call was unreadable in the original
        # source; filtering on the class alone matches every element marked
        # "videoBox" inside the list.
        video_boxs = thumbs.find_all(class_="videoBox")
        for video in video_boxs:
            a = video.find("a")
            url = hosturl + a["href"]
            img = a.find("img")
            img_src = img["data-mediumthumb"]
            img_media = img["data-mediabook"]
            video_720p = get_video_hd_addr(url)
            cur.execute(sql, (url, img_src, img_media, video_720p))
            # Commit per row: without an explicit commit the inserts stay
            # in an open transaction and are discarded when the script ends.
            conn.commit()
    except Exception:
        # Keep the original best-effort contract, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` does.
        return 'error'
def get_next_page_url(soup):
    """Return the href of the "next page" link, or ``False`` if there is none.

    Looks for ``<li class="page_next">`` in *soup* and reads the ``href`` of
    the first ``<a>`` inside it. ``False`` is returned when the list item,
    the anchor, or the href is missing, so callers can use the result
    directly as a loop condition.
    """
    next_item = soup.find("li", class_="page_next")
    if next_item is None:
        return False
    anchor = next_item.find("a")
    if anchor is None:
        # A page_next item without a link previously raised TypeError.
        return False
    href = anchor.get("href")
    return href if href else False
def check_login(check_url):
    """Return the ``notificationIcons`` element of *check_url*, if present.

    The page is fetched with ``get_page_soup``; the element with
    ``id="notificationIcons"`` is returned when found. ``None`` is returned
    when the element is absent or the page could not be fetched, so the
    result can be used as a truth value by the caller.
    """
    soup = get_page_soup(check_url)
    if not soup:
        # get_page_soup signals failure with False, which has no .find().
        return None
    return soup.find(id="notificationIcons")
def get_video_hd_addr(video_url):
    """Return the href of the 720p download button on a video page.

    Prints *video_url*, fetches the page with ``get_page_soup`` and scans
    every ``<a class="downloadBtn">``. A button matches when its third
    child node is text equal to ``'720p'`` after stripping whitespace.

    Returns:
        The matching button's ``href``, or ``None`` when the page could not
        be fetched or no 720p button exists.
    """
    print(video_url)
    soup = get_page_soup(video_url)
    if not soup:
        # Fetch failed (get_page_soup returned False): nothing to scan.
        return None
    for a in soup.find_all("a", class_="downloadBtn"):
        parts = a.contents
        # Guard against buttons with fewer than three children or whose
        # third child is a nested tag rather than a text node.
        if len(parts) > 2 and isinstance(parts[2], str) and parts[2].strip() == '720p':
            return a.get("href")
    return None
# Script entry: make sure we are logged in, then walk the HD video listing
# page by page, storing every video until there is no "next page" link.
video_page_url = hosturl + '/video?hd=1&page=1'
if check_login(video_page_url):
    print('状态: 已登录.')
    print('=============================')
else:
    do_login(login_url)

url = hosturl + '/video?hd=1&page=1'
max_failures = 3  # consecutive fetch failures tolerated before giving up
failures = 0
tag = True
try:
    while tag:
        soup = get_page_soup(url)
        if not soup:
            # Previously a failed fetch retried the same URL forever with
            # no delay; retry a bounded number of times instead.
            failures += 1
            if failures >= max_failures:
                tag = False
            else:
                time.sleep(1)
            continue
        failures = 0
        # Scrape the current page before looking for the next one, so the
        # final page (which has no next link) is not skipped.
        get_video_list(soup)
        next_page_url = get_next_page_url(soup)
        if next_page_url:
            url = hosturl + next_page_url
        else:
            tag = False
    # Persist any inserts still pending in the open transaction.
    conn.commit()
finally:
    cur.close()
    conn.close()
print('程序执行完毕.')