"""Generate a sitemap.xml for a website by crawling it breadth-first."""
import argparse
import datetime
import time
import urllib.parse
from collections import deque
from html.parser import HTMLParser
from typing import List, Optional, Set, Tuple
from urllib.robotparser import RobotFileParser
from xml.sax.saxutils import escape

import httpx

def get_robots_txt(url: str) -> Optional[RobotFileParser]:
    """Fetch and parse the robots.txt file for the given URL."""
    try:
        parsed_url = urllib.parse.urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(robots_url)
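        # read() performs the HTTP fetch itself via urllib (httpx is not involved here).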
        rp.read()
        return rp
    except Exception as e:
        print(f"Error fetching robots.txt: {e}")
        return None

def is_allowed_by_robots(rp: RobotFileParser, url: str, user_agent: str = "SitemapGenerator") -> bool:
    """Check if the URL is allowed by the robots.txt file."""
    if rp is None:
        return True
    return rp.can_fetch(user_agent, url)

def is_valid_url(url: str, base_url: str) -> bool:
    """Check if the URL is valid and part of the target domain."""
    parsed_url = urllib.parse.urlparse(url)
    base_parsed = urllib.parse.urlparse(base_url)
    return parsed_url.netloc == base_parsed.netloc and parsed_url.scheme in ["http", "https"]
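
class _LinkParser(HTMLParser):
    """Collect href values from <a> tags using the stdlib HTML parser.

    httpx exposes only the raw response body and does not parse HTML, so
    this small helper extracts links without a third-party dependency.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: List[str] = []

    def handle_starttag(self, tag, attrs) -> None:
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)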

def get_links_from_page(client: httpx.Client, url: str, base_url: str, visited: Set[str]) -> List[str]:
    """Extract all valid, unvisited links from the given page."""
    try:
        response = client.get(url, follow_redirects=True, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return []
        parser = _LinkParser()
        parser.feed(response.text)
        links = []
        for link in parser.links:
            # Resolve relative links and drop fragments so URLs dedupe cleanly.
            full_url, _ = urllib.parse.urldefrag(urllib.parse.urljoin(url, link))
            if is_valid_url(full_url, base_url) and full_url not in visited:
                links.append(full_url)
        return links
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

def generate_sitemap_urls(base_url: str, max_pages: int = 100, delay: float = 1.0, user_agent: str = "SitemapGenerator") -> List[Tuple[str, str]]:
    """Crawl the website breadth-first and collect URLs with lastmod dates."""
    visited: Set[str] = set()
    sitemap_urls: List[Tuple[str, str]] = []
    robots = get_robots_txt(base_url)

    if not is_allowed_by_robots(robots, base_url):
        print("robots.txt disallows crawling this site.")
        return []

    queue = deque([base_url])
    pages_crawled = 0

    # http2=True requires the optional "h2" dependency: pip install "httpx[http2]".
    # The context manager guarantees the client is closed even if the crawl fails.
    with httpx.Client(http2=True, timeout=10, headers={"User-Agent": user_agent}) as client:
        while queue and pages_crawled < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            # Respect robots.txt for every page, not just the site root.
            if not is_allowed_by_robots(robots, url, user_agent):
                continue
            pages_crawled += 1
            print(f"Crawling: {url}")

            # The crawl date stands in for lastmod, since the real modification
            # time is unknown; the sitemap protocol accepts a plain YYYY-MM-DD date.
            sitemap_urls.append((url, datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")))

            queue.extend(get_links_from_page(client, url, base_url, visited))

            time.sleep(delay)

    return sitemap_urls

def write_sitemap(sitemap_urls: List[Tuple[str, str]], output_file: str = "sitemap.xml") -> None:
    """Write the sitemap URLs to an XML file."""
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
            for url, lastmod in sitemap_urls:
                f.write('  <url>\n')
                # escape() guards against URLs containing &, <, or > breaking the XML.
                f.write(f'    <loc>{escape(url)}</loc>\n')
                f.write(f'    <lastmod>{lastmod}</lastmod>\n')
                f.write('  </url>\n')
            f.write('</urlset>\n')
        print(f"Sitemap generated successfully: {output_file}")
    except OSError as e:
        print(f"Error writing sitemap: {e}")

def main():
    parser = argparse.ArgumentParser(description="Generate a sitemap.xml for a website.")
    parser.add_argument("url", help="The base URL of the website to crawl.")
    parser.add_argument("--max-pages", type=int, default=100, help="Maximum number of pages to crawl.")
    parser.add_argument("--delay", type=float, default=1.0, help="Delay in seconds between requests.")
    parser.add_argument("--output", default="sitemap.xml", help="Output file name for the sitemap.")
    args = parser.parse_args()

    if not args.url.startswith(("http://", "https://")):
        print("Error: URL must start with http:// or https://")
        return

    print(f"Starting sitemap generation for: {args.url}")
    sitemap_urls = generate_sitemap_urls(args.url, args.max_pages, args.delay)
    write_sitemap(sitemap_urls, args.output)

if __name__ == "__main__":
    main()
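
# Example invocation, assuming this file is saved as sitemap_generator.py:
#   python sitemap_generator.py https://example.com --max-pages 50 --delay 0.5 --output sitemap.xml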