# recon-pipeline/recon/web/targets.py

import pickle
from pathlib import Path

import luigi
from luigi.util import inherits

from recon.amass import ParseAmassOutput
from recon.masscan import ParseMasscanOutput
from recon.config import web_ports


@inherits(ParseMasscanOutput, ParseAmassOutput)
class GatherWebTargets(luigi.Task):
    """ Gather all subdomains as well as any ip addresses known to have a configured web port open.

    Args:
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def requires(self):
        """ GatherWebTargets depends on ParseMasscanOutput and ParseAmassOutput to run.

        ParseMasscanOutput expects rate, target_file, interface, and either ports or top_ports as parameters.
        ParseAmassOutput accepts exempt_list and expects target_file

        Returns:
            dict(str: ParseMasscanOutput, str: ParseAmassOutput)
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
        }
        return {
            "masscan-output": ParseMasscanOutput(**args),
            "amass-output": ParseAmassOutput(
                exempt_list=self.exempt_list, target_file=self.target_file, results_dir=self.results_dir,
            ),
        }

    def output(self):
        """ Returns the target output for this task.

        The output file is always named webtargets.txt and is placed in the
        "target-results" subfolder of results_dir.

        Returns:
            luigi.local_target.LocalTarget
        """
        results_subfolder = Path(self.results_dir) / "target-results"

        new_path = results_subfolder / "webtargets.txt"

        return luigi.LocalTarget(new_path.resolve())

    def run(self):
        """ Gather all potential web targets into a single file to pass farther down the pipeline.

        Targets come from two places: the pickled masscan results (ip addresses with an open
        tcp port that is either 80 or listed in web_ports) and every non-blank line of each
        amass output file.  The de-duplicated targets are written one per line, sorted so that
        the resulting file is identical from run to run.
        """
        Path(self.output().path).parent.mkdir(parents=True, exist_ok=True)

        targets = set()

        # NOTE: pickle must only be used on trusted data; this file is expected to be the
        # one written by the upstream ParseMasscanOutput task, not arbitrary user input.
        # The with-block guarantees the handle is closed even if unpickling fails.
        with open(self.input().get("masscan-output").path, "rb") as f:
            ip_dict = pickle.load(f)

        # structure over which we're looping
        # {
        #     "IP_ADDRESS":
        #         {'udp': {"161", "5000", ... },
        #         ...
        #         i.e. {protocol: set(ports) }
        # }
        for target, protocol_dict in ip_dict.items():
            for protocol, ports in protocol_dict.items():
                if protocol == "udp":
                    # udp ports are never treated as web targets
                    continue

                for port in ports:
                    if port == "80":
                        # port 80 is recorded as the bare address, without a :port suffix
                        targets.add(target)
                    elif port in web_ports:
                        targets.add(f"{target}:{port}")

        for amass_result in self.input().get("amass-output").values():
            with amass_result.open() as f:
                for line in f:
                    # we care about all results returned from amass, but a blank line
                    # is not a result and would otherwise become an empty target
                    entry = line.strip()
                    if entry:
                        targets.add(entry)

        with self.output().open("w") as f:
            # sorted so the output does not depend on arbitrary set iteration order
            for target in sorted(targets):
                f.write(f"{target}\n")