Files
recon-pipeline/pipeline/recon/amass.py
epi052 6eb3bd8cb0 Completed store results in a database project (#32)
Co-authored-by: Ryan Good <usafaryangood@gmail.com>

* added initial skeleton; restructured project directories

* removed workers directive from luigi; changed input to tko-subs

* changed masscan command to use config.tool_paths

* linted __init__ files and updated docstring for get_scans

* added per-file-ignores for linting

* recon-pipeline linted

* PoC working for amass results -> db; rudimentary db mgmt commands also

* more linting

* added database management commands to the shell

* db_location passes through to all tasks; masscan results added to db

* removed unused imports from masscan.py

* added ParseNmapOutput class to handle parsing for database storage

* cleaned up repeat code

* searchsploit results stored in db

* lint/format

* gobuster scans now stored in database

* fixed test_recon tests to use db_location

* fixed web tests

* tkosub entries recorded in db

* subjack scan results stored in database

* webanalyze results stored in db

* refactored older commits to use newer helper functions

* refactored older commits to use newer helper functions

* aquatone results stored in database

refactored a few scans to use dbmanager helper functions
refactored db structure wrt headers/screenshots
added 80/443 to web_ports in config.py

* fixed a few queries and re-added webanalyze to FullScan

* view targets/endpoints done

* overhauled nmap parsing

* print all nmap_results good, next to focus on filtering

* complex nmap filters complete

* nmap printing done

* updated pipfile

* view web-technologies complete

* view searchsploit results complete

* removed filesystem code from amass

* targetlist moved to db only

* targets,amass,masscan all cutover to full database; added view ports

* nmap fully db compliant

* aquatone and webtargets db compliant

* gobuster uses db now

* webanalyze db compliant

* all scans except corscanner are db compliant

* recon tests passing

* web tests passing

* linted files

* added tests for helpers.py and parsers.py

* refactored some redundant code

* added tests to pre-commit

* updated amass tests and pre-commit version

* updated recon.targets tests

* updated nmap tests

* updated masscan tests

* updated config tests

* updated web targets tests

* added gobuster tests

* added aquatone tests

* added subdomain takeover and webanalyze tests; updated test data

* removed homegrown sqlite target in favor of the sqla implementation

* added tests for recon-pipeline.py

* fixed cluge function to set __package__ globally

* updated amass tests

* updated targets tests

* updated nmap tests

* updated masscan tests

* updated aquatone tests

* updated nmap tests to account for no searchsploit

* updated nmap tests to account for no searchsploit

* updated masscan tests

* updated subjack/tkosub tests

* updated web targets tests

* updated webanalyze tests

* added corscanner tests

* linted DBManager a bit

* fixed weird cyclic import issue that only happened during docs build; housekeeping

* added models tests, removed test_install dir

* updated docs a bit; sidenav is wonky

* fixed readthedocs requirements.txt

* fixed issue where view results werent populated directly after scan

* added new tests to pipeline; working on docs

* updated a few overlooked view command items

* updated tests to reflect changes to shell

* incremental push of docs update

* documentation done

* updated exploitdb install

* updated exploitdb install

* updated seclists install

* parseamass updates db in the event of no amass output

* removed corscanner

* added pipenv shell to install instructions per @GreaterGoodest

* added pipenv shell to install instructions per @GreaterGoodest

* added check for chromium-browser during aquatone install; closes #26

* added check for old recon-tools dir; updated Path.resolve calls to Path.expanduser.resolve; fixed very specific import bug due to filesystem location

* added CONTRIBUTING.md; updated pre-commit hooks/README

* added .gitattributes for linguist reporting

* updated tests

* fixed a few weird bugs found during test

* updated README

* updated asciinema links in README

* updated README with view command video

* updated other location for url scheme /status

* add ability to specify single target using --target (#31)

* updated a few items in docs and moved tool-dict to tools-dir

* fixed issue where removing tempfile without --verbose caused scan to fail
2020-04-17 10:29:16 -05:00

206 lines
6.7 KiB
Python

import json
import subprocess
from pathlib import Path
import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget
import pipeline.models.db_manager
from .config import tool_paths
from .targets import TargetList
from ..models.target_model import Target
@inherits(TargetList)
class AmassScan(luigi.Task):
    """ Run ``amass`` scan to perform subdomain enumeration of given domain(s).

    Note:
        Expects **TARGET_FILE.domains** file to be a text file with one top-level domain per line.

    Install:
        .. code-block:: console

            sudo apt-get install -y -q amass

    Basic Example:
        .. code-block:: console

            amass enum -ip -brute -active -min-for-recursive 3 -df tesla -json amass.tesla.json

    Luigi Example:
        .. code-block:: console

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.amass AmassScan --target-file tesla

    Args:
        exempt_list: Path to a file providing blacklisted subdomains, one per line.
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    exempt_list = luigi.Parameter(default="")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = (Path(self.results_dir) / "amass-results").expanduser().resolve()

    def requires(self):
        """ AmassScan depends on TargetList to run.

        TargetList expects target_file as a parameter.

        Returns:
            luigi.ExternalTask - TargetList
        """
        args = {"target_file": self.target_file, "results_dir": self.results_dir, "db_location": self.db_location}
        return TargetList(**args)

    def output(self):
        """ Returns the target output for this task.

        Naming convention for the output file is amass.json.

        Returns:
            luigi.local_target.LocalTarget
        """
        # reuse the subfolder already resolved in __init__ rather than rebuilding the path
        return luigi.LocalTarget(self.results_subfolder / "amass.json")

    def run(self):
        """ Run ``amass`` against all hostnames currently stored in the database.

        If the database holds no hostnames, an empty output file is created so the
        Task contract is still satisfied and downstream tasks can run.
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        hostnames = self.db_mgr.get_all_hostnames()

        if not hostnames:
            # nothing to scan; create an empty results file instead of shelling out to `touch`
            # (the old subprocess-based touch broke on paths containing whitespace)
            Path(self.output().path).touch()
            return

        # TargetList generated some domains for us to scan with amass
        amass_input_file = self.results_subfolder / "input-from-targetlist"
        with open(amass_input_file, "w") as f:
            for hostname in hostnames:
                f.write(f"{hostname}\n")

        command = [
            f"{tool_paths.get('amass')}",
            "enum",
            "-active",
            "-ip",
            "-brute",
            "-min-for-recursive",
            "3",
            "-df",
            str(amass_input_file),
            "-json",
            self.output().path,
        ]

        if self.exempt_list:
            command.append("-blf")  # Path to a file providing blacklisted subdomains
            command.append(self.exempt_list)

        subprocess.run(command)

        # clean up the temporary domain list handed to amass
        amass_input_file.unlink()
@inherits(AmassScan)
class ParseAmassOutput(luigi.Task):
    """ Read amass JSON results and create categorized entries into ip|subdomain files.

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = (Path(self.results_dir) / "amass-results").expanduser().resolve()

    def requires(self):
        """ ParseAmassOutput depends on AmassScan to run.

        TargetList expects target_file as a parameter.
        AmassScan accepts exempt_list as an optional parameter.

        Returns:
            luigi.Task - AmassScan
        """
        args = {
            "target_file": self.target_file,
            "exempt_list": self.exempt_list,
            "results_dir": self.results_dir,
            "db_location": self.db_location,
        }
        return AmassScan(**args)

    def output(self):
        """ Returns the target output for this task.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="target", update_id=self.task_id
        )

    def run(self):
        """ Parse the json file produced by AmassScan and store the results in the database.

        An example (prettified) entry from the json file is shown below

            {
              "Timestamp": "2019-09-22T19:20:13-05:00",
              "name": "beta-partners.tesla.com",
              "domain": "tesla.com",
              "addresses": [
                {
                  "ip": "209.133.79.58",
                  "cidr": "209.133.79.0/24",
                  "asn": 394161,
                  "desc": "TESLA - Tesla"
                }
              ],
              "tag": "ext",
              "source": "Previous Enum"
            }
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        try:
            if Path(self.input().path).stat().st_size == 0:
                # AmassScan produced no results; mark the task complete without db entries
                self.output().touch()
                return

            with self.input().open() as amass_json_file:
                # amass emits one json object per line
                for line in amass_json_file:
                    entry = json.loads(line)

                    tgt = self.db_mgr.get_or_create(Target, hostname=entry.get("name"), is_web=True)

                    # "addresses" may be missing from an entry; default to an empty list
                    for address in entry.get("addresses", []):
                        ipaddr = address.get("ip")
                        tgt = self.db_mgr.add_ipv4_or_v6_address_to_target(tgt, ipaddr)

                    self.db_mgr.add(tgt)

            self.output().touch()
        finally:
            # release the database session on every exit path, including the
            # empty-results early return and any parsing/db error
            self.db_mgr.close()