# recon-pipeline/pipeline/recon/masscan.py

import json
import logging
import subprocess
from pathlib import Path

import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget

import pipeline.models.db_manager
from .targets import TargetList
from .amass import ParseAmassOutput
from ..models.port_model import Port
from ..models.ip_address_model import IPAddress

from .config import top_tcp_ports, top_udp_ports, defaults, tool_paths, web_ports


@inherits(TargetList, ParseAmassOutput)
class MasscanScan(luigi.Task):
    """ Run ``masscan`` against a target specified via the TargetList Task.

    Note:
        When specified, ``--top_ports`` is processed and then ultimately passed to ``--ports``.

    Install:
        .. code-block:: console

            git clone https://github.com/robertdavidgraham/masscan /tmp/masscan
            make -s -j -C /tmp/masscan
            sudo mv /tmp/masscan/bin/masscan /usr/local/bin/masscan
            rm -rf /tmp/masscan

    Basic Example:
        .. code-block:: console

            masscan -v --open-only --banners --rate 1000 -e tun0 -oJ masscan.tesla.json --ports 80,443,22,21 -iL tesla.ips

    Luigi Example:
        .. code-block:: console

            PYTHONPATH=$(pwd) luigi --local-scheduler --module pipeline.recon.masscan MasscanScan --target-file tesla --ports 80,443,22,21

    Args:
        rate: desired rate for transmitting packets (packets per second)
        interface: use the named raw network interface, such as "eth0"
        top_ports: Scan top N most popular ports
        ports: specifies the port(s) to be scanned
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
    """

    rate = luigi.Parameter(default=defaults.get("masscan-rate"))
    interface = luigi.Parameter(default=defaults.get("masscan-iface"))
    top_ports = luigi.IntParameter(default=0)  # IntParameter -> top_ports expected as int
    ports = luigi.Parameter(default="")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        # every masscan artifact (json results, temporary input file) lives under this folder
        self.results_subfolder = (Path(self.results_dir) / "masscan-results").expanduser().resolve()

    def output(self):
        """ Returns the target output for this task.

        The output file is named masscan.json and is written to the masscan-results subfolder of results_dir.

        Returns:
            luigi.local_target.LocalTarget
        """
        new_path = self.results_subfolder / "masscan.json"

        return luigi.LocalTarget(new_path.expanduser().resolve())

    def run(self):
        """ Builds the masscan command from the Task's parameters and executes it.

        TargetList is yielded as a dependency first; ParseAmassOutput is yielded as well when the database
        contains hostnames.  All ipv4 addresses known to the database are then written to a temporary input
        file that is handed to masscan via ``-iL`` and removed again once masscan has returned.

        Raises:
            SystemExit: (exit status 2) when neither ``ports`` nor ``top_ports`` was specified
        """
        if not self.ports and not self.top_ports:
            # need at least one, can't be put into argparse scanner because things like amass don't require ports option
            logging.error("Must specify either --top-ports or --ports.")
            # raised directly instead of calling exit(), which is only available when the site module is loaded
            raise SystemExit(2)

        if self.top_ports:
            # if --top-ports used, format the top_*_ports lists as strings and then into a proper masscan --ports option
            top_tcp_ports_str = ",".join(str(x) for x in top_tcp_ports[: self.top_ports])
            top_udp_ports_str = ",".join(str(x) for x in top_udp_ports[: self.top_ports])

            self.ports = f"{top_tcp_ports_str},U:{top_udp_ports_str}"
            self.top_ports = 0

        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        yield TargetList(target_file=self.target_file, results_dir=self.results_dir, db_location=self.db_location)

        if self.db_mgr.get_all_hostnames():
            # TargetList generated some domains for us to scan with amass

            yield ParseAmassOutput(
                target_file=self.target_file,
                exempt_list=self.exempt_list,
                results_dir=self.results_dir,
                db_location=self.db_location,
            )

        command = [
            tool_paths.get("masscan"),
            "-v",
            "--open",
            "--banners",
            "--rate",
            str(self.rate),  # subprocess only accepts str/bytes/path arguments; rate may be given as a number
            "-e",
            self.interface,
            "-oJ",
            self.output().path,
            "--ports",
            self.ports,
            "-iL",
        ]

        # masscan only understands how to scan ipv4
        ip_addresses = self.db_mgr.get_all_ipv4_addresses()
        masscan_input_file = None

        if ip_addresses:
            # TargetList generated ip addresses for us to scan with masscan
            masscan_input_file = self.results_subfolder / "input-from-amass"

            with open(masscan_input_file, "w") as f:
                for ip_address in ip_addresses:
                    f.write(f"{ip_address}\n")

            command.append(str(masscan_input_file))

        try:
            subprocess.run(command)  # will fail if no ipv4 addresses were found
        finally:
            # remove the temporary input file even if masscan could not be launched
            if masscan_input_file is not None:
                masscan_input_file.unlink()


@inherits(MasscanScan)
class ParseMasscanOutput(luigi.Task):
    """ Read masscan JSON results and store the pertinent information (targets, ip addresses, ports) in the database.

    Args:
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = (Path(self.results_dir) / "masscan-results").expanduser().resolve()

    def requires(self):
        """ ParseMasscanOutput depends on MasscanScan to run.

        MasscanScan expects rate, target_file, interface, and either ports or top_ports as parameters.

        Returns:
            luigi.Task - MasscanScan
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "db_location": self.db_location,
            # MasscanScan forwards exempt_list to ParseAmassOutput; without it a user-supplied list is dropped
            "exempt_list": self.exempt_list,
        }
        return MasscanScan(**args)

    def output(self):
        """ Returns the target output for this task.

        Completion is tracked in the database: the target is a marker for the "port" table keyed on this
        Task's task_id.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="port", update_id=self.task_id
        )

    def run(self):
        """ Reads masscan JSON results and populates the database with the targets, ip addresses, and ports found.

        If the results file does not contain valid JSON, the error is logged and the method returns without
        touching the output target, so the Task is not marked complete.
        """
        try:
            # load masscan results from the MasscanScan Task; the handle is closed as soon as parsing is done
            with self.input().open() as results_file:
                entries = json.load(results_file)
        except json.decoder.JSONDecodeError as e:
            # return on exception; output target never touched; pipeline should start again from
            # this task if restarted
            logging.error(e)
            return

        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        # populate database from the loaded JSON
        #
        # masscan JSON structure over which we're looping
        # [
        # {   "ip": "10.10.10.146",   "timestamp": "1567856130", "ports": [ {"port": 22, "proto": "tcp", "status": "open", "reason": "syn-ack", "ttl": 63} ] }
        # ,
        # {   "ip": "10.10.10.146",   "timestamp": "1567856130", "ports": [ {"port": 80, "proto": "tcp", "status": "open", "reason": "syn-ack", "ttl": 63} ] }
        # ]

        for entry in entries:
            single_target_ip = entry.get("ip")

            tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(single_target_ip)

            # NOTE(review): this compares a str against the members of tgt.ip_addresses; if those are
            # IPAddress model instances the test is presumably always true -- confirm against the model
            if single_target_ip not in tgt.ip_addresses:
                tgt.ip_addresses.append(self.db_mgr.get_or_create(IPAddress, ipv4_address=single_target_ip))

            # tolerate entries without a "ports" key instead of raising TypeError on None
            for port_entry in entry.get("ports") or []:
                protocol = port_entry.get("proto")

                port = self.db_mgr.get_or_create(Port, protocol=protocol, port_number=port_entry.get("port"))

                if str(port.port_number) in web_ports:
                    tgt.is_web = True

                tgt.open_ports.append(port)

            self.db_mgr.add(tgt)

        # mark the Task complete once, after every entry was stored (also covers an empty result list)
        self.output().touch()

        self.db_mgr.close()