import os
import re
import time
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlsplit
from docker_logic import create_docker_dir, create_docker_compose, create_nginx_docker

def create_project(dir_name):
    """Create the project folder ``dir_name`` with a ``public`` subfolder.

    Nothing is created when ``dir_name`` already exists; in that case only a
    notice is printed. Any ``OSError`` raised while creating the folders is
    reported on stdout instead of being propagated.

    Args:
        dir_name: Path of the project folder to create.
    """
    try:
        # Guard clause: an existing folder is left untouched.
        if os.path.exists(dir_name):
            print(f'Папка с именем {dir_name} существует!')
            return

        print(dir_name)
        public_dir = os.path.join(dir_name, 'public')
        os.makedirs(public_dir)

        # Report the outcome by re-checking the folder on disk.
        outcome = (
            f'Проект {dir_name} успешно создан в папке {public_dir}'
            if os.path.exists(public_dir)
            else f'Не удалось создать проект {dir_name} в папке {public_dir}'
        )
        print(outcome)
    except OSError as error:
        print(f'Ошибка при создании проекта {dir_name}: {error}')

def process_styles(soup, wdriver, main_link, dir, timeout=30):
    """Save the page's stylesheets under ``<dir>/public``.

    Two things are written:

    * every ``<link rel="stylesheet">`` target is downloaded and stored under
      ``public`` at a location that mirrors the path component of its URL
      (links whose URL starts with ``https://fonts`` are skipped);
    * the CSS rules of the style sheets currently loaded in the browser that
      are inline or served from the page's own origin are dumped to
      ``<dir>/public/static/css/main.css``.

    Download and file-system errors for an individual stylesheet are printed
    and that stylesheet is skipped.

    Args:
        soup: Parsed page; only ``find_all`` is used.
        wdriver: Selenium driver with the page loaded; only
            ``execute_script`` is used.
        main_link: Site root such as ``https://example.com``; relative
            stylesheet URLs are resolved against it.
        dir: Project directory that contains ``public``.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # urljoin needs a trailing slash on the base to treat it as a directory.
    site_root = main_link.rstrip('/') + '/'

    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get('href')
        data_href = link.get('data-href')

        # The URL may live in either attribute; a tag with neither has
        # nothing to download.
        ref = href or data_href
        if not ref:
            continue

        if any(value and value.startswith('https://fonts') for value in (href, data_href)):
            continue

        # urljoin copes with "/x", "./x", "../x", "//host/x" and absolute URLs
        # of any scheme, which plain string concatenation does not.
        try:
            css_url = urljoin(site_root, ref)
            parsed_url = urlparse(css_url)
        except ValueError as e:
            print(f"Failed to fetch {ref}: {e}")
            continue

        if parsed_url.scheme not in ('http', 'https'):
            continue

        # Sanitize each path component separately so the directory structure
        # survives; "." and ".." are dropped so nothing is written outside
        # the public folder.
        parts = [
            re.sub(r'[<>:"\\/|?*]', '_', part)
            for part in parsed_url.path.split('/')
            if part not in ('', '.', '..')
        ]
        if not parts or parsed_url.path.endswith('/'):
            print(f"Failed to fetch {css_url}: URL has no file name")
            continue
        *folders, file_name_with_ext = parts

        try:
            response = requests.get(css_url, timeout=timeout)
            response.raise_for_status()
            css_content = response.text
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {css_url}: {e}")
            continue

        folder_path = os.path.join(dir, 'public', *folders)
        try:
            os.makedirs(folder_path, exist_ok=True)
        except OSError as e:
            print(f"Failed to create directory {folder_path}: {e}")
            continue

        file_path = os.path.join(folder_path, file_name_with_ext)
        try:
            with open(file_path, "w", encoding="utf-8") as css_file:
                css_file.write(css_content)
        except OSError as e:
            print(f"Failed to write file {file_path}: {e}")

    # Collect the text of every readable rule from inline and same-origin
    # style sheets; sheets whose rules cannot be read contribute nothing.
    styles = wdriver.execute_script(
        """
        const styleSheets = Array.from(document.styleSheets);
        const cssRulesList = styleSheets
          .filter(sheet => !sheet.href || sheet.href.startsWith(window.location.origin))
          .flatMap(sheet => {
            try {
              return Array.from(sheet.cssRules) || [];
            } catch (e) {
              return [];
            }
          });

        return cssRulesList.map(rule => rule.cssText).join("\\n");
        """
    )

    css_dir = os.path.join(dir, 'public', 'static', 'css')
    os.makedirs(css_dir, exist_ok=True)

    # NOTE: a downloaded stylesheet whose URL path is /static/css/main.css
    # is overwritten by this dump.
    with open(os.path.join(css_dir, 'main.css'), "w", encoding="utf-8") as css_file:
        css_file.write(styles or "")
    return soup

def process_images(soup, main_link, dir, timeout=30):
    """Download every ``<img>`` of the page into ``<dir>/public``.

    Each image is stored at a location that mirrors the path component of its
    URL. Images without a ``src`` and non-HTTP sources such as ``data:`` URIs
    are skipped. A failure on one image is printed and does not stop the
    remaining images from being processed.

    Args:
        soup: Parsed page; only ``find_all`` is used.
        main_link: Site root such as ``https://example.com``; relative
            image URLs are resolved against it.
        dir: Project directory that contains ``public``.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # urljoin needs a trailing slash on the base to treat it as a directory.
    site_root = main_link.rstrip('/') + '/'

    for img in soup.find_all('img'):
        img_src = img.get('src')
        if not img_src:
            continue

        try:
            # Handles "/x", "x", "//host/x" and absolute http/https URLs alike.
            img_url = urljoin(site_root, str(img_src))
            parsed_url = urlparse(img_url)
            if parsed_url.scheme not in ('http', 'https'):
                continue

            # Sanitize per component to keep the directory structure; "." and
            # ".." are dropped so nothing is written outside the public folder.
            parts = [
                re.sub(r'[<>:"\\/|?*]', '_', part)
                for part in parsed_url.path.split('/')
                if part not in ('', '.', '..')
            ]
            if not parts or parsed_url.path.endswith('/'):
                continue
            *folders, output_file_name = parts

            response = requests.get(img_url, timeout=timeout)
            # Do not store an error page as if it were image data.
            response.raise_for_status()

            target_dir = os.path.join(dir, 'public', *folders)
            os.makedirs(target_dir, exist_ok=True)
            with open(os.path.join(target_dir, output_file_name), 'wb') as file:
                file.write(response.content)
        except (requests.exceptions.RequestException, OSError, ValueError) as e:
            print(f"Error processing images: {e}")

    return soup

def process_scripts(soup, main_link, dir, timeout=30):
    """Download every external ``<script src=...>`` into ``<dir>/public``.

    Each script is stored at a location that mirrors the path component of
    its URL. Inline scripts (no ``src``), non-HTTP sources and URLs whose
    path ends in ``.pdf`` are skipped. Download and file-system errors for an
    individual script are printed and that script is skipped.

    Args:
        soup: Parsed page; only ``find_all`` is used.
        main_link: Site root such as ``https://example.com``; relative
            script URLs are resolved against it.
        dir: Project directory that contains ``public``.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # urljoin needs a trailing slash on the base to treat it as a directory.
    site_root = main_link.rstrip('/') + '/'

    for script in soup.find_all("script"):
        src = script.get('src')
        if not src:
            continue

        try:
            script_url = urljoin(site_root, src)
            parsed_url = urlparse(script_url)
        except ValueError as e:
            print(f"Failed to fetch {src}: {e}")
            continue

        if parsed_url.scheme not in ('http', 'https'):
            continue
        if parsed_url.path.lower().endswith('.pdf'):
            continue

        # Sanitize per component to keep the directory structure (sanitizing
        # the whole folder string turned "/static/js" into "_static_js");
        # "." and ".." are dropped so nothing escapes the public folder.
        parts = [
            re.sub(r'[<>:"\\/|?*]', '_', part)
            for part in parsed_url.path.split('/')
            if part not in ('', '.', '..')
        ]
        if not parts or parsed_url.path.endswith('/'):
            print(f"Failed to fetch {script_url}: URL has no file name")
            continue
        *folders, file_name_with_ext = parts

        try:
            response = requests.get(script_url, timeout=timeout)
            response.raise_for_status()
            js_content = response.text
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {script_url}: {e}")
            continue

        folder_path = os.path.join(dir, 'public', *folders)
        try:
            os.makedirs(folder_path, exist_ok=True)
        except OSError as e:
            print(f"Failed to create directory {folder_path}: {e}")
            continue

        file_path = os.path.join(folder_path, file_name_with_ext)
        try:
            with open(file_path, "w", encoding="utf-8") as js_file:
                js_file.write(js_content)
        except OSError as e:
            print(f"Failed to write file {file_path}: {e}")

    return soup

def save_soup_to_file(soup, dir, page):
    """Write the prettified markup of ``soup`` to ``<dir>/public/<page>.html``.

    Args:
        soup: Object whose ``prettify()`` result is saved.
        dir: Project directory; its ``public`` subfolder must already exist.
        page: File name without the ``.html`` extension.
    """
    destination = f"{dir}/public/{page}.html"
    with open(destination, "w", encoding="utf-8") as html_file:
        rendered = soup.prettify()
        html_file.write(str(rendered))

def get_all_links(soup, base_url):
    """Collect the crawlable same-site links found in ``soup``.

    Every ``<a href>`` is resolved against ``base_url`` and kept only when it
    is an ``http``/``https`` URL on the same host as ``base_url`` and its
    path does not end in ``.pdf``. Fragments are removed so that
    ``/page#a`` and ``/page#b`` map to one URL; links that are only a
    fragment are ignored because they point into the page being processed.

    Args:
        soup: Parsed page; only ``find_all`` is used.
        base_url: Site root such as ``https://example.com``.

    Returns:
        A set of absolute URL strings.
    """
    # urljoin needs a trailing slash on the base to treat it as a directory.
    site_root = base_url.rstrip("/") + "/"
    site_host = urlparse(site_root).netloc.lower()

    links = set()
    for tag in soup.find_all("a", href=True):
        href = (tag.get("href") or "").strip()
        if not href or href.startswith("#"):
            continue

        try:
            parsed_href = urlparse(urljoin(site_root, href))
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): nothing to crawl.
            continue

        # Drops mailto:, tel:, javascript: and similar non-page links.
        if parsed_href.scheme not in ("http", "https"):
            continue
        # Compare hosts instead of searching for base_url as a substring, so
        # foreign URLs that merely mention the site are rejected.
        if parsed_href.netloc.lower() != site_host:
            continue
        if parsed_href.path.lower().endswith(".pdf"):
            continue

        links.add(parsed_href._replace(fragment="").geturl())

    return links

def sanitize_filename(filename):
    """Return ``filename`` with ``< > : " / \\ | ? *`` each replaced by ``_``."""
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    return forbidden_chars.sub('_', filename)

def get_page(link, dir, page, main_link):
    """Render ``link`` in Firefox and save the page with its assets.

    The page is skipped when its name contains ``?`` or when
    ``<dir>/public/<sanitized page>.html`` already exists. Otherwise the
    rendered HTML is parsed, its styles, images and scripts are processed and
    the markup is written to that file.

    Args:
        link: URL to open in the browser.
        dir: Project directory that contains ``public``.
        page: Page name used (after sanitizing) as the HTML file name.
        main_link: Site root passed on to the asset processors.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the page was
        skipped or a ``WebDriverException`` occurred.
    """
    if '?' in page:
        print(f"Skipping page with parameters: {page}")
        return None

    sanitized_page = sanitize_filename(page)

    page_path = f"{dir}/public/{sanitized_page}.html"
    if os.path.exists(page_path):
        print(f"Page {sanitized_page} already exists. Skipping...")
        return None

    # Start the browser separately: if this fails there is nothing to close.
    try:
        wdriver = webdriver.Firefox()
    except WebDriverException as e:
        print(f"Error processing page {page}: {e}")
        return None

    try:
        wdriver.get(link)
        # Fixed pause before reading the DOM — presumably to let client-side
        # rendering finish; TODO confirm whether an explicit wait would do.
        time.sleep(10)
        soup = BeautifulSoup(wdriver.page_source, "html.parser")
        process_styles(soup, wdriver, main_link, dir)
        process_images(soup, main_link, dir)
        process_scripts(soup, main_link, dir)
        save_soup_to_file(soup, dir, sanitized_page)
        print(f"Page {sanitized_page} saved successfully.")
        return soup
    except WebDriverException as e:
        print(f"Error processing page {page}: {e}")
        return None
    finally:
        # Always close the browser, also when a non-WebDriver error (such as
        # an OSError from writing files) propagates to the caller.
        try:
            wdriver.quit()
        except WebDriverException as e:
            print(f"Error processing page {page}: {e}")

def crawl_site(start_url, dir, main_link):
    """Crawl the site breadth-first from ``start_url`` and save every page.

    Each URL is handed to ``get_page`` once; the links found on a saved page
    (via ``get_all_links``) are queued for later visits.

    Args:
        start_url: First URL to visit.
        dir: Project directory that contains ``public``.
        main_link: Site root such as ``https://example.com``; it is removed
            from a URL to derive the page's file name.
    """
    from collections import deque

    # `seen` holds every URL that has been queued, so a link that appears on
    # many pages is queued only once.
    seen = {start_url}
    to_visit = deque([start_url])

    while to_visit:
        current_url = to_visit.popleft()

        print(f"Visiting: {current_url}")
        page_name = current_url.replace(main_link, "").strip("/").replace("/", "_")
        # The site root has an empty path; without a fallback it would be
        # written to a file literally named ".html".
        if not page_name:
            page_name = "index"

        soup = get_page(current_url, dir, page_name, main_link)
        if soup is None:
            continue

        for link in get_all_links(soup, main_link):
            if link not in seen:
                seen.add(link)
                to_visit.append(link)

    print("Копирование сайта завершено.")

def main():
    """Ask for a URL, create the project scaffolding and crawl the site.

    The project directory is named after the URL's host. A URL entered
    without a scheme is treated as ``https://``.
    """
    web_link = input('Введите ссылку на страницу: ').strip()
    if not web_link:
        print('Ссылка не указана.')
        return

    # Without a scheme urlsplit() puts the host into .path, which left
    # main_link as "://" and the start URL without a scheme.
    if '://' not in web_link:
        web_link = 'https://' + web_link.lstrip('/')

    parsed_web_link = urlsplit(web_link)
    main_link = f"{parsed_web_link.scheme}://{parsed_web_link.netloc}"
    # Fall back to the path for inputs that still have no host part.
    link = parsed_web_link.netloc or parsed_web_link.path.strip('/')

    create_project(link)
    create_docker_dir(link)
    create_docker_compose(link, parsed_web_link.netloc)
    create_nginx_docker(link, "index")

    crawl_site(web_link, link, main_link)
    
# Run the interactive scraper only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
