Wrote a little crawler for fun over the May Day holiday
Python really is handy for small jobs like this. The three scripts below first collect the names of Nanjing residential compounds (xiaoqu) from Lianjia, then look up each compound's geohash and coordinates through ele.me's POI API, and finally pull the nearby ele.me restaurants into MySQL.
First, collect the compound names. The script walks Lianjia's Nanjing xiaoqu listing page by page, reads the pager's page-data attribute to know when to stop, and inserts each name into a village table:

__author__ = 'LeoKim'

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, http.cookiejar
import json
import time
import pymysql

conn = pymysql.connect(host='localhost',
                       user='root',
                       passwd='superhero',
                       db='python_test',
                       port=3306,
                       charset='utf8')
cur = conn.cursor()  # get a cursor

# Fetch one listing page by URL and return its HTML.
def getVillage(url):
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
                         ('Cookie', 'select_city=320100; lianjia_uuid=c73af582-9ed7-42ed-9738-bbd4688c67e0; UM_distinctid=15bb9f33ca387c-0ac15874ad5d0d-6a11157a-1fa400-15bb9f33ca4a02; _jzqckmp=1; all-lj=c28812af28ef34a41ba2474a2b5c52c2; _jzqx=1.1493473537.1493544561.2.jzqsr=nj%2Elianjia%2Ecom|jzqct=/ershoufang/gulou/.jzqsr=nj%2Elianjia%2Ecom|jzqct=/xiaoqu/pg1/; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _smt_uid=59049861.8870a59; CNZZDATA1253492138=835595246-1493470448-null%7C1493541950; CNZZDATA1254525948=1922726511-1493470772-null%7C1493540995; CNZZDATA1255633284=630946367-1493469955-null%7C1493543402; CNZZDATA1255604082=270979082-1493468920-null%7C1493544528; _qzja=1.1520598967.1493473405458.1493480837509.1493544561423.1493544849473.1493544851953.0.0.0.29.3; _qzjb=1.1493544561423.10.0.0.0; _qzjc=1; _qzjto=10.1.0; _jzqa=1.2414222906473966000.1493473537.1493480838.1493544561.3; _jzqc=1; _jzqb=1.10.10.1493544561.1; _ga=GA1.2.1108117219.1493473408; _gid=GA1.2.2091828031.1493544853; lianjia_ssid=5c8ebd96-81f4-4430-bfda-6d941fcb8663')]
    urllib.request.install_opener(opener)
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    return html_string

def start(start_url):
    html_doc = getVillage(start_url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    # read the total page count and the current page from the pager div
    totalPageNoDiv = soup.find("div", "house-lst-page-box")
    Page = json.loads(totalPageNoDiv.attrs['page-data'])
    totalPageNo = Page['totalPage']
    curPage = Page['curPage']
    print('Crawling page ' + str(curPage) + ' of ' + str(totalPageNo) + '.')

    # extract the compound names and insert them into the database
    divs = soup.find_all("div", "title")
    for div in divs:
        a_tag = div.find("a", target="_blank")
        if a_tag:
            sql = "INSERT INTO `village` (`name`) VALUES (%s)"
            cur.execute(sql, (a_tag.string,))
    conn.commit()  # persist this page's inserts (pymysql does not autocommit by default)

    curPage = curPage + 1
    if totalPageNo == curPage - 1:
        print('Done.')
    else:
        time.sleep(10)
        start_url = "http://nj.lianjia.com/xiaoqu/pg" + str(curPage)
        start(start_url)

curPage = 1
start_url = "http://nj.lianjia.com/xiaoqu/pg" + str(curPage)
start(start_url)

cur.close()   # close the cursor
conn.close()  # release the database connection
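One caveat on the design: start() recurses one level deeper per listing page, so a site with more pages than Python's default recursion limit (around 1000 frames) would eventually blow the stack. A minimal iterative sketch of the same pagination, reusing the getVillage() helper and the conn/cur objects defined above:

page_url_base = "http://nj.lianjia.com/xiaoqu/pg"   # same URL pattern as the script above

page = 1
total = 1
while page <= total:
    soup = BeautifulSoup(getVillage(page_url_base + str(page)), 'html.parser')
    page_data = json.loads(soup.find("div", "house-lst-page-box").attrs['page-data'])
    total = page_data['totalPage']        # total page count reported by the pager
    for div in soup.find_all("div", "title"):
        a_tag = div.find("a", target="_blank")
        if a_tag:
            cur.execute("INSERT INTO `village` (`name`) VALUES (%s)", (a_tag.string,))
    conn.commit()                         # persist each page as it is scraped
    page += 1
    time.sleep(10)                        # same polite delay as the original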
The second script goes back over the village table and, for every compound that has no geohash yet, queries ele.me's POI search by keyword to fill in a geohash and coordinates:

__author__ = 'LeoKim'

import json
import pymysql
import urllib.request, urllib.parse, http.cookiejar
from urllib import parse

conn = pymysql.connect(host='localhost',
                       user='root',
                       passwd='superhero',
                       db='python_test',
                       port=3306,
                       charset='utf8')
cur = conn.cursor()  # get a cursor

# Look a compound up by name on ele.me's POI search and return the first hit.
def getgeohash(keyword):
    key = {
        'keyword': keyword
    }
    url = 'https://mainsite-restapi.ele.me/v2/pois?extras%5B%5D=count&geohash=wtsm0ss7yfj8&limit=20&type=nearby&' + parse.urlencode(key)
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
                         ('Cookie', 'ubt_ssid=but8xnmtkpfrbvypd9z3hxaa5i8ugmj0_2017-04-29; _utrace=edd9bb6de13caed667d2cf273d73fc0a_2017-04-29')]
    urllib.request.install_opener(opener)
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    try:
        info = json.loads(html_string)  # the response is plain JSON; json.loads is safer than eval
        if info:
            return info[0]
        else:
            return 'error'
    except:
        return 'error'

sql = "SELECT id,name FROM `village` where geohash is null"
cur.execute(sql)
data = cur.fetchall()

for d in data:
    print(d[0])
    gh = getgeohash(d[1])
    if gh == 'error':
        geohash = 'error'
        latitude = ''
        longitude = ''
    else:
        geohash = gh['geohash']
        latitude = gh['latitude']
        longitude = gh['longitude']
    print(geohash, latitude, longitude)

    # gh['geohash'] is None
    sql = "UPDATE `village` SET geohash=%s,latitude=%s,longitude=%s where id=%s"
    cur.execute(sql, (geohash, latitude, longitude, d[0]))
    conn.commit()  # persist each update (pymysql does not autocommit by default)

cur.close()   # close the cursor
conn.close()  # release the database connection
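Unlike the Lianjia crawler, this loop hits the ele.me POI API once per row with no pause at all. A hedged sketch of a wrapper that spaces the calls out and retries once on failure, reusing getgeohash() from above (the one-second delay and single retry are assumptions, not documented limits of the API):

import time

def getgeohash_politely(keyword, delay=1.0, retries=1):
    # Pause before each request and retry a failed lookup once.
    for attempt in range(retries + 1):
        time.sleep(delay)          # assumed polite interval between API calls
        result = getgeohash(keyword)
        if result != 'error':
            return result
    return 'error'

The batch loop can then call getgeohash_politely(d[1]) in place of getgeohash(d[1]) without any other change.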
The third script takes each compound's geohash and coordinates and pulls the nearby restaurants from ele.me's restaurant listing API into a store table:

__author__ = 'LeoKim'

import json
import re
import pymysql
import urllib.request, urllib.parse, http.cookiejar
from urllib import parse

conn = pymysql.connect(host='localhost',
                       user='root',
                       passwd='superhero',
                       db='python_test',
                       port=3306,
                       charset='utf8')
cur = conn.cursor()  # get a cursor

# Fetch up to `limit` restaurants around one compound and store them.
def getstore(village_id, geohash, latitude, longitude, limit):
    key = {
        'geohash': geohash,
        'latitude': latitude,
        'longitude': longitude,
        'limit': limit
    }
    # note the trailing '&': without it the urlencoded keys get glued onto terminal=web
    url = 'https://mainsite-restapi.ele.me/shopping/restaurants?extras%5B%5D=activities&offset=0&terminal=web&' + parse.urlencode(key)
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
                         ('Cookie', 'ubt_ssid=but8xnmtkpfrbvypd9z3hxaa5i8ugmj0_2017-04-29; _utrace=edd9bb6de13caed667d2cf273d73fc0a_2017-04-29')]
    urllib.request.install_opener(opener)
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    jsonData = json.loads(html_string)  # the response is a plain JSON array of restaurants

    for data in jsonData:
        print(data['id'])
        print(village_id)
        print(data['name'])
        print(data['recent_order_num'])
        print(data['address'])
        print(data['order_lead_time'])
        print(data['float_delivery_fee'])

        # average_cost is a text field, so keep only the digits
        average_cost = 0
        if 'average_cost' in data:
            cost = re.findall(r'\d+', data['average_cost'])
            if cost:
                average_cost = cost[0]
        print(average_cost)
        print(data['rating'])
        print('---------------------------------------------')

        shop_id = data['id']
        name = data['name']
        address = data['address']
        recent_order_num = data['recent_order_num']
        order_lead_time = data['order_lead_time']
        float_delivery_fee = data['float_delivery_fee']
        rating = data['rating']

        sql = "INSERT INTO `store` (`shop_id`,`village_id`,`name`,`address`,`recent_order_num`,`order_lead_time`,`float_delivery_fee`,`average_cost`,`rating`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (shop_id, village_id, name, address, recent_order_num, order_lead_time, float_delivery_fee, average_cost, rating))
    conn.commit()  # persist this compound's restaurants (pymysql does not autocommit by default)

# getstore('wtst84g4g0u','31.91988','118.83238',30)

sql = "SELECT id,name,geohash,latitude,longitude FROM `village` where id >482 and geohash is not null"
cur.execute(sql)
data = cur.fetchall()

for d in data:
    village_id = d[0]
    geohash = d[2]
    latitude = d[3]
    longitude = d[4]
    getstore(village_id, geohash, latitude, longitude, 30)

cur.close()   # close the cursor
conn.close()  # release the database connection
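For completeness, the two MySQL tables the scripts read and write never appear in the post itself. A sketch of the schema implied by the INSERT, UPDATE and SELECT statements above; the column types are my assumptions, not taken from the original database:

import pymysql

# Assumed schema, inferred from the queries in the three scripts above.
SCHEMA = [
    """CREATE TABLE IF NOT EXISTS `village` (
        `id`        INT PRIMARY KEY AUTO_INCREMENT,
        `name`      VARCHAR(255),
        `geohash`   VARCHAR(32),
        `latitude`  VARCHAR(32),
        `longitude` VARCHAR(32)
    )""",
    """CREATE TABLE IF NOT EXISTS `store` (
        `shop_id`            VARCHAR(64),
        `village_id`         INT,
        `name`               VARCHAR(255),
        `address`            VARCHAR(255),
        `recent_order_num`   INT,
        `order_lead_time`    INT,
        `float_delivery_fee` FLOAT,
        `average_cost`       INT,
        `rating`             FLOAT
    )""",
]

conn = pymysql.connect(host='localhost', user='root', passwd='superhero',
                       db='python_test', port=3306, charset='utf8')
with conn.cursor() as cur:
    for statement in SCHEMA:
        cur.execute(statement)
conn.commit()
conn.close()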