1. Project process
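The script opens the Taobao search page, submits the configured keywords, reads the total page count from the pager, then steps through pages 2 to N; on every page it parses each product card and writes the result to MongoDB.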
2. Central scheduling
# Central scheduling
def main():
    try:
        total = search()
        # pull the page count out of text such as "共 100 页"
        total = int(re.compile(r'.*?(\d+).*?').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('exception:', e)
    finally:
        browser.close()
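search() crawls page 1 and returns the pager's "total pages" text; the regex extracts the digit run from that text so the loop knows how many times to call next_page() (pages 2 through total).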
3. Simulated query
# query based on keywords
def search():
    try:
        browser.get('https://www.taobao.com/')
        # wait until the search box has loaded
        input_search = wait.until(EC.presence_of_element_located((By.ID, 'q')))
        # wait until the search button is clickable
        submit_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        # the element showing the total number of result pages
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.wraper div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('response timeout')
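Every lookup goes through WebDriverWait with an expected condition, so each call blocks until the element actually exists (or is clickable) instead of sleeping a fixed interval, and raises TimeoutException after the 10 seconds configured on wait.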
4. Turning to the next page
# next page crawl
def next_page(index):
    try:
        page_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
        page_input.clear()
        page_input.send_keys(index)
        submit.click()
        # wait until the highlighted page number equals the requested index
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(index)))
        get_products()
    except NoSuchElementException:
        print('element not loaded')
        return next_page(index)
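If the pager elements are missing, the except branch retries by calling next_page() recursively with the same index. There is no retry cap, so a page that never loads would recurse until Python's recursion limit is hit; a bounded retry counter would be a safer variant.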
5. Parsing commodity information
# get all the goods on one page
def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items')))
    html = browser.page_source
    doc = PyQuery(html)
    items = doc('.m-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            'price': float(item.find('.price').text()[2:]),  # drop the currency prefix (e.g. "¥ ")
            'deal': item.find('.deal-cnt').text()[:-3],      # drop the trailing "人付款" suffix
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS
        }
        save_to_mongo(product)
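Because PyQuery parses browser.page_source taken from Selenium, the product cards that Taobao renders with JavaScript are already present in the HTML; fetching the same URL with a plain HTTP client would not contain them.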
6. Complete code
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from setting import *

client = pymongo.MongoClient(MONGO_HOST)
db = client[MONGO_DB]
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
# query based on keywords
def search():
    try:
        browser.get('https://www.taobao.com/')
        # wait until the search box has loaded
        input_search = wait.until(EC.presence_of_element_located((By.ID, 'q')))
        # wait until the search button is clickable
        submit_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        # the element showing the total number of result pages
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.wraper div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('response timeout')
# next page crawl
def next_page(index):
    try:
        page_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
        page_input.clear()
        page_input.send_keys(index)
        submit.click()
        # wait until the highlighted page number equals the requested index
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
            str(index)))
        get_products()
    except NoSuchElementException:
        print('element not loaded')
        return next_page(index)
# get all the goods on one page
def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items')))
    html = browser.page_source
    doc = PyQuery(html)
    items = doc('.m-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            'price': float(item.find('.price').text()[2:]),  # drop the currency prefix (e.g. "¥ ")
            'deal': item.find('.deal-cnt').text()[:-3],      # drop the trailing "人付款" suffix
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS
        }
        save_to_mongo(product)
# save to MongoDB
def save_to_mongo(product):
    try:
        if db[MONGO_TABLE].insert_one(product).inserted_id:
            print('saved successfully:', product)
    except Exception:
        print('failed to save')
# Central scheduling
def main():
    try:
        total = search()
        # pull the page count out of text such as "共 100 页"
        total = int(re.compile(r'.*?(\d+).*?').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('exception:', e)
    finally:
        browser.close()

if __name__ == '__main__':
    main()
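The script does from setting import *, but the article never shows that module. Here is a minimal sketch of what setting.py has to define; only the variable names are fixed by the script, every value below is an assumption:

# setting.py -- minimal sketch; all values below are assumptions
MONGO_HOST = 'localhost'   # MongoDB server address (assumed)
MONGO_DB = 'taobao'        # database name (assumed)
MONGO_TABLE = 'products'   # collection name (assumed)
KEYWORDS = '美食'           # text typed into the Taobao search box (assumed)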
7. Running result
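To see what a run actually saved, you can query the collection directly; a minimal sketch, assuming the same setting.py values as the crawler:

# inspect the crawled products -- sketch, assumes the setting.py above
import pymongo
from setting import MONGO_HOST, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_HOST)
collection = client[MONGO_DB][MONGO_TABLE]
print('saved products:', collection.count_documents({}))
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc)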