How to implement a vertical crawler system in Python

In this article, the editor shares how to use Python to implement a simple vertical crawler system. The crawler is split into five small modules, each shown below: html_downloader, html_outputer, html_parser, spider_main and urls_manager, plus a short Beautiful Soup test script. I hope you learn something from it; let's discuss it together!
Html_downloader
from urllib import request

def download(url):
    if url is None:
        return None
    response = request.urlopen(url)
    if response.getcode() != 200:
        return None
    return response.read()
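One caveat: urlopen() raises URLError on network failures rather than returning None, so a crash mid-crawl is possible. Below is a minimal sketch of a fault-tolerant variant; the function name download_safe and the try/except guard are my additions, not part of the original module:

from urllib import request
from urllib.error import URLError

def download_safe(url):
    # same contract as download(), but swallows network errors (assumption: that is acceptable here)
    if url is None:
        return None
    try:
        response = request.urlopen(url)
    except URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()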
Html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    fout = open('output.html', 'w')
    # the table markup below was stripped from the original post and has been restored
    fout.write('<html><body><table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table></body></html>')
    fout.close()
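Note that open('output.html', 'w') uses the platform default encoding, and the file stays open if a write raises. A sketch of the same function using a with block and explicit UTF-8 follows; this is a variation of mine, not the original code:

def output_html_utf8():
    # same output as output_html(), written via a context manager so the file always closes
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html><body><table>')
        for dataitem in data_list:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % dataitem['url'])
            fout.write('<td>%s</td>' % dataitem['title'])
            fout.write('<td>%s</td>' % dataitem['datetime'])
            fout.write('<td>%s</td>' % dataitem['visitcount'])
            fout.write('</tr>')
        fout.write('</table></body></html>')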
Html_parser
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\w+/page/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    res_data = {}
    title_node = soup.find('h2', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    if page_url is None or html_cont is None:
        return None, None  # keep the caller's tuple unpacking safe
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
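To see what parse() returns without touching the network, you can feed it a hand-written fragment. The sketch below, appended to the html_parser module, uses a made-up page and URL chosen only to match the selectors and link regex above:

# made-up fragment matching the h2/span selectors and the href regex
sample = b'''<html><body>
<a href="/info/page/1056/page.htm">more news</a>
<h2 class="arti-title">Campus news headline</h2>
<span class="arti-update">2021-06-01</span>
<span class="WP_VisitCount">123</span>
</body></html>'''

urls, data = parse('http://news.zzuli.edu.cn/', sample)
print(urls)   # {'http://news.zzuli.edu.cn/info/page/1056/page.htm'}
print(data['title'], data['datetime'], data['visitcount'])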
Spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # start the crawler loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:  # stop after ten pages
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
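One practical note: the loop above fires requests back to back. If you point it at a live site, it is friendlier to pause between downloads. A minimal sketch of such a wrapper follows; polite_download and the delay value are my additions, not part of the original modules:

import time
import html_downloader

def polite_download(url, delay=1.0):
    # sleep before each request so the target server is not hammered
    time.sleep(delay)
    return html_downloader.download(url)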
Test_64
from bs4 import BeautifulSoup
import re

# the classic example document from the Beautiful Soup docs;
# the HTML tags were stripped from the original post and have been restored
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>"""

soup = BeautifulSoup(html_doc, 'html.parser')

print('get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('get lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('regular match')
link_node = soup.find('a', href=re.compile(r"ill"))
print(link_node.name, link_node['href'], link_node.get_text())

print('get p paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
Urls_manager
new_urls = set()
old_urls = set()

def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    # move a URL from the pending set to the crawled set
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0

After reading this article, I believe you have a good sense of how to implement a vertical crawler system in Python. If you want to learn more, you are welcome to follow the industry information channel. Thank you for reading!