

import re
import threading
import urllib.request
from urllib.parse import urlparse
from multiprocessing.pool import Pool

seed = ''
tocrawl = set([seed])
crawled = set()


class HTMLParser(object):
    def __init__(self):
        # Regexes for the contents of the <title> tag and for absolute http(s) links.
        self.title_re = re.compile(r'<title[^>]*>([^<]+)</title>')
        self.link_re = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def get_links(self, html):
        return self.link_re.findall(html)

    def get_title(self, html):
        title = self.title_re.findall(html)
        if title:
            title = title.pop().strip()
        return title


parser = HTMLParser()


def get_links(url):
    links = []
    try:
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/35.0.1916.47 Safari/537.36'
            }
        )
        html = urllib.request.urlopen(req).read().decode('latin-1')
        title = parser.get_title(html)
        if title:
            print(title)
        links = parser.get_links(html)
    except Exception as error:
        print(error)
    if links:
        for link in links:
            # Treat scheme-less links as relative to the seed URL.
            if not urlparse(link).scheme:
                link = '{}{}'.format(seed, link)
            yield urlparse(link)


def crawl(root):
    # print('Crawling:--> {}'.format(root))
    queue_size = len(tocrawl)
    total_crawled = len(crawled)
    if queue_size and total_crawled:
        percent_complete = total_crawled / queue_size * 100
    else:
        percent_complete = .0
    print('Crawled: {}\nLinks in queue: {}\n'.format(total_crawled, queue_size))
    print('{:0.1f}% complete'.format(percent_complete))
    print('{} active threads'.format(threading.active_count()))
    crawled.add(root)
    for link in get_links(root):
        # Only queue links on the same host as the seed that have not been crawled yet.
        if link.netloc == urlparse(seed).netloc and link.geturl() not in crawled:
            tocrawl.add(link.geturl())
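
The snippet stops at defining crawl(); the loop that actually drives the crawler is not shown above. Below is a minimal driver sketch, not part of the original code: the seed URL is a placeholder, and it uses ThreadPool from the same multiprocessing.pool module rather than the imported Pool, since crawl() mutates the module-level tocrawl/crawled sets and reports threading.active_count(), state that worker processes would not share.

# Minimal driver sketch (assumption, not from the original post).
from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    seed = 'http://example.com'   # hypothetical starting URL; the original leaves seed blank
    tocrawl.clear()
    tocrawl.add(seed)

    pool = ThreadPool(processes=8)    # 8 worker threads is an arbitrary choice
    while True:
        pending = tocrawl - crawled   # links discovered but not yet visited
        if not pending:
            break
        pool.map(crawl, pending)
    pool.close()
    pool.join()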

