Web scraping — a simple sample web scraper.

Note: in a parsed URL, `netloc` is the "network location" — it contains the domain itself.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re

def scrape_emails(url):
    """Fetch *url* and extract email addresses and same-page .html links.

    Returns a tuple ``(emails, html_links)`` where *emails* is a list of
    email-like strings found in the page text and *html_links* is a list of
    href values ending in ``.html``.  On any request failure, prints the
    error and returns ``([], [])`` so the crawler can continue.
    """
    try:
        # timeout so a dead host cannot hang the crawler forever
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)
        # only follow static .html pages; other link types are ignored
        html_links = [link['href'] for link in links if link['href'].endswith('.html')]
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, soup.get_text())
        return emails, html_links

    # BUG FIX: was `request.exceptions...` (undefined name), which turned any
    # request failure into an unhandled NameError.
    except requests.exceptions.RequestException as e:
        print(f"error accessing url {e}")
        return [], []

def crawl_website(baseURL, max=10):
    """Breadth-first crawl starting at *baseURL*, collecting email addresses.

    Follows only links whose domain (netloc) matches *baseURL*, visiting at
    most *max* pages.  Returns a list of unique email addresses in the order
    they were first seen.

    NOTE: the parameter name ``max`` shadows the builtin ``max``; it is kept
    for backward compatibility with existing callers.
    """
    visited = set()
    to_visit = [baseURL]
    all_emails = []
    seen_emails = set()  # dedupe while preserving first-seen order
    while to_visit and len(visited) < max:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)
        emails, urls = scrape_emails(current_url)
        # only record emails we have not seen on an earlier page
        for email in emails:
            if email not in seen_emails:
                seen_emails.add(email)
                all_emails.append(email)

        for link in urls:
            full_url = urljoin(baseURL, link)
            # stay on the starting domain and skip pages already crawled
            if urlparse(baseURL).netloc == urlparse(full_url).netloc and full_url not in visited:
                to_visit.append(full_url)

    return all_emails
# Guard so importing this module does not immediately start a crawl.
if __name__ == "__main__":
    baseURL = 'http://10.102.17.32'
    emails = crawl_website(baseURL)
    for email in emails:
        print(email)