WIP: add waybackurls scan (#56)

* fixed up config.defaults definition

tools-dir and database-dir now use defaults.home value

* added tool definition file; closes #54

* added basic PoC for waybackurls scanner; updated helpers.py test

* added Endpoint/Target parsing; updated existing tests to pass

* added tests for waybackurls

* added WaybackurlsScan to FullScan

* added documentation for WaybackurlsScan
This commit is contained in:
epi052
2020-05-02 18:06:44 -07:00
committed by GitHub
parent 078fdaada7
commit f556319453
13 changed files with 212 additions and 13 deletions

View File

@@ -71,7 +71,7 @@ The installer maintains a (naive) list of installed tools at `~/.local/recon-pip
## Defining a Scan's Scope ## Defining a Scan's Scope
**New in v0.9.0**: In the event you're scanning a single ip address or host, simply use `--target`. It accepts a single target and works in conjunction with `--exempt-list` if specified. **New as of v0.9.0**: In the event you're scanning a single ip address or host, simply use `--target`. It accepts a single target and works in conjunction with `--exempt-list` if specified.
```text ```text
scan HTBScan --target 10.10.10.183 --top-ports 1000 scan HTBScan --target 10.10.10.183 --top-ports 1000
@@ -146,6 +146,7 @@ Scan the target
[-] SearchsploitScan queued [-] SearchsploitScan queued
[-] ThreadedNmapScan queued [-] ThreadedNmapScan queued
[-] SubjackScan queued [-] SubjackScan queued
[-] WaybackurlsScan queued
[-] AquatoneScan queued [-] AquatoneScan queued
[-] GobusterScan queued [-] GobusterScan queued
[db-1] recon-pipeline> [db-1] recon-pipeline>

View File

@@ -67,6 +67,13 @@ TKOSubs Scanner
.. autoclass:: pipeline.recon.web.subdomain_takeover.TKOSubsScan .. autoclass:: pipeline.recon.web.subdomain_takeover.TKOSubsScan
:members: :members:
WaybackurlsScan Scanner
#######################
.. autoclass:: pipeline.recon.web.waybackurls.WaybackurlsScan
:members:
Webanalyze Scanner Webanalyze Scanner
################## ##################

View File

@@ -15,6 +15,7 @@ following individual scans are available
- :class:`pipeline.recon.web.subdomain_takeover.SubjackScan` - :class:`pipeline.recon.web.subdomain_takeover.SubjackScan`
- :class:`pipeline.recon.nmap.ThreadedNmapScan` - :class:`pipeline.recon.nmap.ThreadedNmapScan`
- :class:`pipeline.recon.web.subdomain_takeover.TKOSubsScan` - :class:`pipeline.recon.web.subdomain_takeover.TKOSubsScan`
- :class:`pipeline.recon.web.waybackurls.WaybackurlsScan`
- :class:`pipeline.recon.web.webanalyze.WebanalyzeScan` - :class:`pipeline.recon.web.webanalyze.WebanalyzeScan`
Additionally, two wrapper scans are made available. These execute multiple scans in a pipeline. Additionally, two wrapper scans are made available. These execute multiple scans in a pipeline.
@@ -49,6 +50,7 @@ Create a targetfile
/root/PycharmProjects/recon-pipeline/pipeline/recon-pipeline.py /root/PycharmProjects/recon-pipeline/pipeline/recon-pipeline.py
recon-pipeline> recon-pipeline>
**New as of v0.9.0**: In the event you're scanning a single ip address or host, simply use ``--target``. It accepts a single target and works in conjunction with ``--exempt-list`` if specified.
Create a new database to store scan results Create a new database to store scan results
@@ -79,6 +81,7 @@ Scan the target
[-] WebanalyzeScan queued [-] WebanalyzeScan queued
[-] SearchsploitScan queued [-] SearchsploitScan queued
[-] ThreadedNmapScan queued [-] ThreadedNmapScan queued
[-] WaybackurlsScan queued
[-] SubjackScan queued [-] SubjackScan queued
[-] AquatoneScan queued [-] AquatoneScan queued
[-] GobusterScan queued [-] GobusterScan queued

View File

@@ -141,7 +141,7 @@ class DBManager:
def get_status_codes(self): def get_status_codes(self):
""" Simple helper that returns all status codes found during scanning """ """ Simple helper that returns all status codes found during scanning """
return set(str(x[0]) for x in self.session.query(Endpoint.status_code).all()) return set(str(x[0]) for x in self.session.query(Endpoint.status_code).all() if x[0] is not None)
def get_and_filter(self, model, defaults=None, **kwargs): def get_and_filter(self, model, defaults=None, **kwargs):
""" Simple helper to either get an existing record if it exists otherwise create and return a new instance """ """ Simple helper to either get an existing record if it exists otherwise create and return a new instance """

View File

@@ -256,7 +256,7 @@ class ReconShell(cmd2.Cmd):
""" Scan something. """ Scan something.
Possible scans include Possible scans include
AmassScan GobusterScan SearchsploitScan AmassScan GobusterScan SearchsploitScan WaybackurlsScan
ThreadedNmapScan WebanalyzeScan AquatoneScan FullScan ThreadedNmapScan WebanalyzeScan AquatoneScan FullScan
MasscanScan SubjackScan TKOSubsScan HTBScan MasscanScan SubjackScan TKOSubsScan HTBScan
""" """
@@ -345,7 +345,6 @@ class ReconShell(cmd2.Cmd):
if persistent_tool_dict.exists(): if persistent_tool_dict.exists():
tools = pickle.loads(persistent_tool_dict.read_bytes()) tools = pickle.loads(persistent_tool_dict.read_bytes())
print(args.tool)
if tools.get(args.tool).get("dependencies"): if tools.get(args.tool).get("dependencies"):
# get all of the requested tools dependencies # get all of the requested tools dependencies
@@ -605,7 +604,7 @@ class ReconShell(cmd2.Cmd):
for endpoint in endpoints: for endpoint in endpoints:
color = color_map.get(str(endpoint.status_code)[0]) color = color_map.get(str(endpoint.status_code)[0])
if args.plain: if args.plain or endpoint.status_code is None:
results.append(endpoint.url) results.append(endpoint.url)
else: else:
results.append(f"[{style(endpoint.status_code, fg=color)}] {endpoint.url}") results.append(f"[{style(endpoint.status_code, fg=color)}] {endpoint.url}")

View File

@@ -10,14 +10,13 @@ defaults = {
"gobuster-extensions": "", "gobuster-extensions": "",
"results-dir": "recon-results", "results-dir": "recon-results",
"aquatone-scan-timeout": "900", "aquatone-scan-timeout": "900",
"tools-dir": f"{Path.home()}/.local/recon-pipeline/tools", "home": Path.home(),
"database-dir": f"{Path.home()}/.local/recon-pipeline/databases",
"home": Path.home().expanduser().resolve(),
} }
defaults["tools-dir"] = f"{defaults.get('home')}/.local/recon-pipeline/tools"
defaults["database-dir"] = f"{defaults.get('home')}/.local/recon-pipeline/databases"
defaults["gobuster-wordlist"] = f"{defaults.get('tools-dir')}/seclists/Discovery/Web-Content/common.txt" defaults["gobuster-wordlist"] = f"{defaults.get('tools-dir')}/seclists/Discovery/Web-Content/common.txt"
tool_paths = { tool_paths = {
"aquatone": f"{defaults.get('tools-dir')}/aquatone", "aquatone": f"{defaults.get('tools-dir')}/aquatone",
"tko-subs": f"{Path.home()}/go/bin/tko-subs", "tko-subs": f"{Path.home()}/go/bin/tko-subs",
@@ -34,6 +33,7 @@ tool_paths = {
"luigid": str(Path(__file__).parents[2] / "luigid.service"), "luigid": str(Path(__file__).parents[2] / "luigid.service"),
"seclists": f"{defaults.get('tools-dir')}/seclists", "seclists": f"{defaults.get('tools-dir')}/seclists",
"exploitdb": f"{defaults.get('tools-dir')}/exploitdb", "exploitdb": f"{defaults.get('tools-dir')}/exploitdb",
"waybackurls": f"{Path.home()}/go/bin/waybackurls",
} }
web_ports = { web_ports = {

View File

@@ -2,4 +2,5 @@ from .aquatone import AquatoneScan
from .gobuster import GobusterScan from .gobuster import GobusterScan
from .targets import GatherWebTargets from .targets import GatherWebTargets
from .webanalyze import WebanalyzeScan from .webanalyze import WebanalyzeScan
from .waybackurls import WaybackurlsScan
from .subdomain_takeover import SubjackScan, TKOSubsScan from .subdomain_takeover import SubjackScan, TKOSubsScan

View File

@@ -0,0 +1,117 @@
import subprocess
from pathlib import Path
from urllib.parse import urlparse
import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget
from .targets import GatherWebTargets
from ...tools import tools
from ...models.endpoint_model import Endpoint
import pipeline.models.db_manager
@inherits(GatherWebTargets)
class WaybackurlsScan(luigi.Task):
    """ Fetch known URLs from the Wayback Machine, Common Crawl, and Virus Total for historic data about the target.

    Install:
        .. code-block:: console

            go get github.com/tomnomnom/waybackurls

    Basic Example:
        ``waybackurls`` commands are structured like the example below.

        ``cat domains.txt | waybackurls > urls``

    Luigi Example:
        .. code-block:: python

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.web.waybackurls WaybackurlsScan --target-file tesla --top-ports 1000

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # db_mgr handles all Target/Endpoint persistence for this task
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = Path(self.results_dir) / "waybackurls-results"

    def requires(self):
        """ WaybackurlsScan depends on GatherWebTargets to run.

        GatherWebTargets accepts exempt_list and expects rate, target_file, interface,
        and either ports or top_ports as parameters

        Returns:
            luigi.Task - GatherWebTargets
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "exempt_list": self.exempt_list,
            "db_location": self.db_location,
        }
        return GatherWebTargets(**args)

    def output(self):
        """ Returns the target output for this task.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="endpoint", update_id=self.task_id
        )

    def run(self):
        """ Defines the options/arguments sent to waybackurls after processing.

        Feeds all known hostnames to ``waybackurls`` over stdin and records each
        returned URL as an Endpoint tied to its Target in the database.
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        command = [tools.get("waybackurls").get("path")]

        # feed hostnames straight to waybackurls via stdin; avoids writing a
        # temp file that would be left behind in results if anything below raised
        hostnames = "\n".join(self.db_mgr.get_all_hostnames())

        proc = subprocess.run(command, input=f"{hostnames}\n".encode(), stdout=subprocess.PIPE)

        for url in proc.stdout.decode().splitlines():
            if not url:
                continue

            parsed_url = urlparse(url)

            # the hostname may belong to a Target we've seen before or a new one
            tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(parsed_url.hostname)

            endpoint = self.db_mgr.get_or_create(Endpoint, url=url, target=tgt)

            if endpoint not in tgt.endpoints:
                tgt.endpoints.append(endpoint)

            self.db_mgr.add(tgt)
            self.db_mgr.add(endpoint)

        # mark the SQLAlchemyTarget as complete so luigi won't re-run the task
        self.output().touch()

View File

@@ -2,10 +2,7 @@ import luigi
from luigi.util import inherits from luigi.util import inherits
from .nmap import SearchsploitScan from .nmap import SearchsploitScan
from .web import AquatoneScan from .web import AquatoneScan, GobusterScan, SubjackScan, TKOSubsScan, WaybackurlsScan, WebanalyzeScan
from .web import GobusterScan
from .web import WebanalyzeScan
from .web import TKOSubsScan, SubjackScan
@inherits(SearchsploitScan, AquatoneScan, TKOSubsScan, SubjackScan, GobusterScan, WebanalyzeScan) @inherits(SearchsploitScan, AquatoneScan, TKOSubsScan, SubjackScan, GobusterScan, WebanalyzeScan)
@@ -68,6 +65,7 @@ class FullScan(luigi.WrapperTask):
del args["threads"] del args["threads"]
yield TKOSubsScan(**args) yield TKOSubsScan(**args)
yield WaybackurlsScan(**args)
@inherits(SearchsploitScan, AquatoneScan, GobusterScan, WebanalyzeScan) @inherits(SearchsploitScan, AquatoneScan, GobusterScan, WebanalyzeScan)

View File

@@ -0,0 +1,9 @@
# tool definition for waybackurls; consumed by the pipeline's tool installer
# NOTE(review): !get_tool_path / !get_default / !join_path / !join appear to be
# project-defined YAML constructors -- semantics inferred from usage; confirm
installed: false
dependencies: ["go"]
go: &gotool !get_tool_path "{go}"
path: !join_path [!get_default "{home}", go, bin, waybackurls]
commands:
- !join [*gotool, get, github.com/tomnomnom/waybackurls]
shell: false

View File

@@ -18,6 +18,7 @@ def test_get_scans():
"SearchsploitScan", "SearchsploitScan",
"ThreadedNmapScan", "ThreadedNmapScan",
"WebanalyzeScan", "WebanalyzeScan",
"WaybackurlsScan",
] ]
assert len(scan_names) == len(scans.keys()) assert len(scan_names) == len(scans.keys())

View File

@@ -440,6 +440,12 @@ class TestReconShell:
], ],
"shell": True, "shell": True,
}, },
"waybackurls": {
"installed": True,
"depencencies": ["go"],
"commands": ["/usr/local/go/bin/go get github.com/tomnomnom/waybackurls"],
"shell": True,
},
} }
tooldir = tmp_path / ".local" / "recon-pipeline" / "tools" tooldir = tmp_path / ".local" / "recon-pipeline" / "tools"

View File

@@ -0,0 +1,57 @@
import shutil
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch
from pipeline.recon.web import WaybackurlsScan, GatherWebTargets
class TestGatherWebTargets:
    """ Exercises WaybackurlsScan setup, dependency wiring, and run().

    NOTE(review): class name says GatherWebTargets but the scan under test is
    WaybackurlsScan -- looks like a copy/paste leftover; confirm before renaming.
    """

    def setup_method(self):
        # fresh scratch directory and scan instance for every test
        self.tmp_path = Path(tempfile.mkdtemp())
        db_path = self.tmp_path / "testing.sqlite"
        self.scan = WaybackurlsScan(
            target_file=__file__, results_dir=str(self.tmp_path), db_location=str(db_path)
        )

    def teardown_method(self):
        shutil.rmtree(self.tmp_path)

    def test_scan_requires(self):
        with patch("pipeline.recon.web.GatherWebTargets"):
            assert isinstance(self.scan.requires(), GatherWebTargets)

    def test_scan_creates_database(self):
        location = self.scan.db_mgr.location
        assert location.exists()
        assert location == self.tmp_path / "testing.sqlite"

    def test_scan_creates_results_dir(self):
        assert self.tmp_path / "waybackurls-results" == self.scan.results_subfolder

    def test_scan_run(self):
        urls = ["https://drive.google.com", "https://maps.google.com"]
        raw = "\n".join(urls) + "\n\n"

        # fake CompletedProcess whose stdout decodes to two urls plus a blank line
        fake_proc = MagicMock()
        fake_proc.stdout.return_value = raw.encode()
        fake_proc.stdout.decode.return_value = raw
        fake_proc.stdout.decode.splitlines.return_value = urls

        self.scan.results_subfolder = self.tmp_path / "waybackurls-results"

        # stub every database interaction; we only assert that run() drives them
        mgr = self.scan.db_mgr
        mgr.get_all_hostnames = MagicMock(return_value=["google.com"])
        mgr.add = MagicMock()
        mgr.get_or_create = MagicMock()
        mgr.get_or_create_target_by_ip_or_hostname = MagicMock()

        with patch("subprocess.run", autospec=True) as mocked_run:
            mocked_run.return_value = fake_proc
            self.scan.run()

        assert mocked_run.called
        assert mgr.add.called
        assert mgr.get_or_create.called
        assert mgr.get_or_create_target_by_ip_or_hostname.called