"""Generate a sitemap.xml for a website by crawling it breadth-first."""
import argparse
import datetime
import time
import urllib.parse
from collections import deque
from html.parser import HTMLParser
from typing import List, Optional, Set, Tuple
from urllib.robotparser import RobotFileParser
from xml.sax.saxutils import escape

import httpx

def get_robots_txt(url: str) -> Optional[RobotFileParser]:
    """Fetch and parse the robots.txt file for the given URL."""
    try:
        parsed_url = urllib.parse.urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(robots_url)
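        # read() performs the HTTP fetch itself via urllib (httpx is not involved here).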
        rp.read()
        return rp
    except Exception as e:
        print(f"Error fetching robots.txt: {e}")
        return None

def is_allowed_by_robots(rp: RobotFileParser, url: str, user_agent: str = "SitemapGenerator") -> bool:
    """Check if the URL is allowed by the robots.txt file."""
    if rp is None:
        return True
    return rp.can_fetch(user_agent, url)

def is_valid_url(url: str, base_url: str) -> bool:
    """Check if the URL is valid and part of the target domain."""
    parsed_url = urllib.parse.urlparse(url)
    base_parsed = urllib.parse.urlparse(base_url)
    return parsed_url.netloc == base_parsed.netloc and parsed_url.scheme in ["http", "https"]
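
class _LinkParser(HTMLParser):
    """Collect href values from <a> tags using the stdlib HTML parser.

    httpx exposes only the raw response body and does not parse HTML, so
    this small helper extracts links without a third-party dependency.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: List[str] = []

    def handle_starttag(self, tag, attrs) -> None:
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)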

def get_links_from_page(client: httpx.Client, url: str, base_url: str, visited: Set[str]) -> List[str]:
    """Extract all valid, unvisited links from the given page."""
    try:
        response = client.get(url, follow_redirects=True, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return []
        parser = _LinkParser()
        parser.feed(response.text)
        links = []
        for link in parser.links:
            # Resolve relative links and drop fragments so URLs dedupe cleanly.
            full_url, _ = urllib.parse.urldefrag(urllib.parse.urljoin(url, link))
            if is_valid_url(full_url, base_url) and full_url not in visited:
                links.append(full_url)
        return links
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

def generate_sitemap_urls(base_url: str, max_pages: int = 100, delay: float = 1.0, user_agent: str = "SitemapGenerator") -> List[Tuple[str, str]]:
    """Crawl the website breadth-first and collect URLs with lastmod dates."""
    visited: Set[str] = set()
    sitemap_urls: List[Tuple[str, str]] = []
    robots = get_robots_txt(base_url)

    if not is_allowed_by_robots(robots, base_url):
        print("robots.txt disallows crawling this site.")
        return []

    queue = deque([base_url])
    pages_crawled = 0

    # http2=True requires the optional "h2" dependency: pip install "httpx[http2]".
    # The context manager guarantees the client is closed even if the crawl fails.
    with httpx.Client(http2=True, timeout=10, headers={"User-Agent": user_agent}) as client:
        while queue and pages_crawled < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            # Respect robots.txt for every page, not just the site root.
            if not is_allowed_by_robots(robots, url, user_agent):
                continue
            pages_crawled += 1
            print(f"Crawling: {url}")

            # The crawl date stands in for lastmod, since the real modification
            # time is unknown; the sitemap protocol accepts a plain YYYY-MM-DD date.
            sitemap_urls.append((url, datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")))

            queue.extend(get_links_from_page(client, url, base_url, visited))

            time.sleep(delay)

    return sitemap_urls

def write_sitemap(sitemap_urls: List[Tuple[str, str]], output_file: str = "sitemap.xml") -> None:
    """Write the sitemap URLs to an XML file."""
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
            for url, lastmod in sitemap_urls:
                f.write('  <url>\n')
                # escape() guards against URLs containing &, <, or > breaking the XML.
                f.write(f'    <loc>{escape(url)}</loc>\n')
                f.write(f'    <lastmod>{lastmod}</lastmod>\n')
                f.write('  </url>\n')
            f.write('</urlset>\n')
        print(f"Sitemap generated successfully: {output_file}")
    except OSError as e:
        print(f"Error writing sitemap: {e}")

def main():
    parser = argparse.ArgumentParser(description="Generate a sitemap.xml for a website.")
    parser.add_argument("url", help="The base URL of the website to crawl.")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl.")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay in seconds between requests.")
    parser.add_argument("--output", default="sitemap.xml", help="Output file name for the sitemap.")
    args = parser.parse_args()

    if not args.url.startswith(("http://", "https://")):
        print("Error: URL must start with http:// or https://")
        return

    print(f"Starting sitemap generation for: {args.url}")
    sitemap_urls = generate_sitemap_urls(args.url, args.max_pages, args.delay)
    write_sitemap(sitemap_urls, args.output)

if __name__ == "__main__":
    main()
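
# Example invocation, assuming this file is saved as sitemap_generator.py:
#   python sitemap_generator.py https://example.com --max-pages 50 --delay 0.5 --output sitemap.xml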