import subprocess
from pathlib import Path
from urllib.parse import urlparse

import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget

from .targets import GatherWebTargets
from ...tools import tools
from ...models.endpoint_model import Endpoint

import pipeline.models.db_manager


@inherits(GatherWebTargets)
class WaybackurlsScan(luigi.Task):
    """ Fetch known URLs from the Wayback Machine, Common Crawl, and Virus Total for historic data about the target.

    Install:
        .. code-block:: console

            go get github.com/tomnomnom/waybackurls

    Basic Example:
        ``waybackurls`` commands are structured like the example below.

        ``cat domains.txt | waybackurls > urls``

    Luigi Example:
        .. code-block:: python

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.web.waybackurls WaybackurlsScan --target-file tesla --top-ports 1000

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # database handle used both for reading hostnames and for storing discovered endpoints
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        # scratch directory for this task, created on demand in run()
        self.results_subfolder = Path(self.results_dir) / "waybackurls-results"

    def requires(self):
        """ WaybackurlsScan depends on GatherWebTargets to run.

        GatherWebTargets accepts exempt_list and expects rate, target_file, interface,
                         and either ports or top_ports as parameters

        Returns:
            luigi.Task - GatherWebTargets
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "exempt_list": self.exempt_list,
            "db_location": self.db_location,
        }
        return GatherWebTargets(**args)

    def output(self):
        """ Returns the target output for this task.

        The target is keyed on this task's ``task_id`` in the "endpoint" table.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="endpoint", update_id=self.task_id
        )

    def run(self):
        """ Defines the options/arguments sent to waybackurls after processing.

        Every hostname known to the database is written to a temporary input file, which is
        fed to waybackurls on stdin.  Each non-empty line of its stdout is treated as a URL
        and stored as an Endpoint attached to the Target matching the URL's hostname.

        The output target is touched exactly once, after all URLs have been processed, so the
        task is also marked as done when waybackurls returns no results.  The temporary input
        file is removed whether or not processing succeeds.
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        command = [tools.get("waybackurls").get("path")]

        waybackurls_input_file = self.results_subfolder / "input-from-webtargets"

        try:
            with open(waybackurls_input_file, "w") as f:
                for target in self.db_mgr.get_all_hostnames():
                    f.write(f"{target}\n")

            # waybackurls reads its list of domains from stdin, one per line
            with open(waybackurls_input_file) as target_list:
                proc = subprocess.run(command, stdin=target_list, stdout=subprocess.PIPE)

            for url in proc.stdout.decode().splitlines():
                if not url:
                    continue

                parsed_url = urlparse(url)

                # get Target, may exist already or not
                ip_or_hostname = parsed_url.hostname
                tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(ip_or_hostname)

                endpoint = self.db_mgr.get_or_create(Endpoint, url=url, target=tgt)

                if endpoint not in tgt.endpoints:
                    tgt.endpoints.append(endpoint)

                self.db_mgr.add(tgt)
                self.db_mgr.add(endpoint)

            # mark completion once, outside the loop: a run that yields zero URLs is still a
            # finished run, and there is no reason to rewrite the marker for every URL
            self.output().touch()
        finally:
            # the input file is only scratch data; never leave it behind, even on failure
            if waybackurls_input_file.exists():
                waybackurls_input_file.unlink()