1 爬取数据代码
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
# coding=utf-8
"""Scrape Taobao search results with Selenium and store each item in MySQL.

The script opens https://www.taobao.com in Chrome, submits a search, walks
through every result page via the pager's "jump to page" form, and hands each
parsed item to ``mysqlOp.mysqlOp``.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# TimeoutException is raised by WebDriverWait.until when a condition is not met in time.
from selenium.common.exceptions import TimeoutException
# Regular expressions (used to pull the page count out of the pager text).
import re
import sys
from pyquery import PyQuery as pq
from config import *
# Database helper that inserts one goods record per call.
import mysqlOp

# CSS selectors used on the home page and on the search-result page.
SEARCH_BOX = "#q"
SEARCH_BUTTON = "#J_TSearchForm > div.search-button > button"
PAGER_TOTAL = "#mainsrp-pager > div > div > div > div.total"
PAGER_INPUT = "#mainsrp-pager > div > div > div > div.form > input"
PAGER_SUBMIT = "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"
PAGER_ACTIVE = "#mainsrp-pager > div > div > div > ul > li.item.active > span"
ITEM_SELECTOR = "#mainsrp-itemlist .items .item"

# First run of digits in the pager's "total pages" text.
PAGE_COUNT_RE = re.compile(r"(\d+)")

# Upper bound on attempts when a page interaction times out.
MAX_RETRIES = 5

driver = webdriver.Chrome()
# Alternative: use the PhantomJS driver instead of Chrome.
# driver = webdriver.PhantomJS()
driver.get("https://www.taobao.com")
driver.set_window_size(1400, 900)
wait = WebDriverWait(driver, 10)


def _present(selector):
    """Wait until an element matching *selector* is in the DOM and return it."""
    # The expected condition takes ONE locator tuple, not two positional args.
    return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))


def _clickable(selector):
    """Wait until an element matching *selector* is clickable and return it."""
    return wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))


def _retry_on_timeout(action, max_retries):
    """Call *action* until it succeeds, retrying on TimeoutException.

    At least one attempt is always made. Once *max_retries* attempts have
    timed out, the last TimeoutException is re-raised instead of retrying
    forever.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            return action()
        except TimeoutException:
            if attempt >= max_retries:
                raise


def search(keyword="美食", max_retries=MAX_RETRIES):
    """Submit *keyword* in the search box and scrape the first result page.

    Args:
        keyword: Text typed into the search box.
        max_retries: Maximum number of attempts when a wait times out.

    Returns:
        The number of items stored from the first result page.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    def attempt():
        search_box = _present(SEARCH_BOX)
        submit = _clickable(SEARCH_BUTTON)
        search_box.clear()
        search_box.send_keys(keyword)
        submit.click()
        # Collect the data of the first page.
        return get_goods()

    return _retry_on_timeout(attempt, max_retries)


def get_total():
    """Return the raw text of the pager element that shows the total page count."""
    total = _present(PAGER_TOTAL)
    return total.text


def next_page(page, max_retries=MAX_RETRIES):
    """Jump to result page *page* and scrape it.

    Args:
        page: Page number to enter into the pager form.
        max_retries: Maximum number of attempts when a wait times out.

    Returns:
        The number of items stored from that page.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    page_text = str(page)

    def attempt():
        page_box = _present(PAGER_INPUT)
        submit = _clickable(PAGER_SUBMIT)
        page_box.clear()
        page_box.send_keys(page_text)
        submit.click()
        # Wait until the pager highlights the requested page number.
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, PAGER_ACTIVE), page_text)
        )
        # Collect the data of the current page.
        return get_goods()

    # The retry's result is returned, so a timeout no longer leaves the count unset.
    return _retry_on_timeout(attempt, max_retries)


def get_goods():
    """Parse every item on the current result page and insert it into MySQL.

    Returns:
        The number of items found (and inserted) on the page.
    """
    _present(ITEM_SELECTOR)
    doc = pq(driver.page_source)
    count = 0
    for item in doc(ITEM_SELECTOR).items():
        goods = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(goods)
        # Insert the record into the database.
        mysqlOp.mysqlOp(goods)
        count += 1
    return count


def main():
    """Run the search, scrape every result page, and print the totals.

    Prints the total page count, then the total number of items stored
    (the first page included). The browser is closed when the run ends,
    whether it succeeded or failed.

    Raises:
        ValueError: If the pager text contains no page number.
    """
    try:
        total_count = search()
        total_text = get_total()
        # Extract the page count from the pager text.
        match = PAGE_COUNT_RE.search(total_text)
        if match is None:
            raise ValueError("no page count found in pager text: %r" % (total_text,))
        total = int(match.group(1))
        print(total)
        # Page 1 was already scraped by search(); continue from page 2.
        for page in range(2, total + 1):
            total_count += next_page(page)
        print(total_count)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
2 将数据存入 MySQL
创建一个名为 mysqlOp.py 的文件
|
1
2
3
4
5
6
7
8
9
|
# coding=utf-8
"""MySQL helper that stores one scraped goods record per call."""
from pymysql import *

# Parameterized insert; values are bound by the driver, not formatted into the SQL.
_INSERT_GOODS_SQL = "insert into goods(image,price,deal,title,shop,location) values(%s,%s,%s,%s,%s,%s)"

# Keys read from the goods dict, in the column order of the insert statement.
_GOODS_FIELDS = ('image', 'price', 'deal', 'title', 'shop', 'location')


def mysqlOp(goods):
    """Insert one goods record into the ``goods`` table.

    A new connection is opened for the call and is always closed again,
    even when the insert or the commit fails.

    Args:
        goods: Mapping with the keys ``image``, ``price``, ``deal``,
            ``title``, ``shop`` and ``location``.

    Raises:
        KeyError: If *goods* lacks one of the expected keys.
    """
    # Build the parameters first so a malformed record never opens a connection.
    params = tuple(goods[field] for field in _GOODS_FIELDS)

    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration that is not checked in.
    conn = connect(host='127.0.0.1', port=3306, user='root', passwd='1qaz2wsx#EDC', db='taobao_meishi', charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(_INSERT_GOODS_SQL, params)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Closing without a commit discards the uncommitted insert.
        conn.close()
转自:https://www.cnblogs.com/yinliang-liang/p/9391746.html |