# recon-pipeline/pipeline/recon/web/targets.py

import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget

import pipeline.models.db_manager
from ..config import web_ports
from ..amass import ParseAmassOutput
from ..masscan import ParseMasscanOutput


@inherits(ParseMasscanOutput)
class GatherWebTargets(luigi.Task):
    """ Gather all subdomains as well as any ip addresses known to have a configured web port open.

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        """ Initialize the task and create the database manager bound to ``db_location``. """
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)

    def requires(self):
        """ GatherWebTargets depends on ParseMasscanOutput and ParseAmassOutput to run.

        ParseMasscanOutput expects rate, target_file, interface, and either ports or top_ports as parameters.
        ParseAmassOutput accepts exempt_list and expects target_file

        Returns:
            dict(str: ParseMasscanOutput, str: ParseAmassOutput)
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "db_location": self.db_location,
        }
        return {
            "masscan-output": ParseMasscanOutput(**args),
            "amass-output": ParseAmassOutput(
                exempt_list=self.exempt_list,
                target_file=self.target_file,
                results_dir=self.results_dir,
                db_location=self.db_location,
            ),
        }

    def output(self):
        """ Returns the target output for this task.

        The target points at the "target" table and uses this task's ``task_id`` as its update id.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="target", update_id=self.task_id
        )

    def run(self):
        """ Gather all potential web targets and tag them as web in the database.

        For every target returned by the database manager, the tcp ports returned for that target are
        compared against ``web_ports``; when at least one matches, the target's ``is_web`` attribute is set
        to True and the target is handed back to the database manager.
        """
        try:
            for target in self.db_mgr.get_all_targets():
                ports = self.db_mgr.get_ports_by_ip_or_host_and_protocol(target, "tcp")
                if not any(port in web_ports for port in ports):
                    continue

                tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(target)
                tgt.is_web = True
                self.db_mgr.add(tgt)

            # the output is touched exactly once, and only after every target has been processed, so a failure
            # part-way through the loop cannot leave the task marked as complete.  touching here also covers
            # the case where no target has a web port: the task is still marked complete even though no
            # database entry may have been changed
            self.output().touch()
        finally:
            # always release the database manager, including when one of the calls above raises
            self.db_mgr.close()