From f556319453e361cd8896156fa660046c41f2ac96 Mon Sep 17 00:00:00 2001 From: epi052 <43392618+epi052@users.noreply.github.com> Date: Sat, 2 May 2020 18:06:44 -0700 Subject: [PATCH] WIP: add waybackurls scan (#56) * fixed up config.defaults definition tools-dir and database-dir now use defaults.home value * added tool definition file; closes #54 * added basic PoC for waybackurls scanner; updated helpers.py test * added Endpoint/Target parsing; updated existing tests to pass * added tests for waybackurls * added WaybackurlsScan to FullScan * added documentation for WaybackurlsScan --- README.md | 3 +- docs/api/scanners.rst | 7 ++ docs/overview/running_scans.rst | 3 + pipeline/models/db_manager.py | 2 +- pipeline/recon-pipeline.py | 5 +- pipeline/recon/config.py | 8 +- pipeline/recon/web/__init__.py | 1 + pipeline/recon/web/waybackurls.py | 117 ++++++++++++++++++ pipeline/recon/wrappers.py | 6 +- pipeline/tools/waybackurls.yaml | 9 ++ tests/test_recon/test_helpers.py | 1 + tests/test_shell/test_recon_pipeline_shell.py | 6 + tests/test_web/test_waybackurls.py | 57 +++++++++ 13 files changed, 212 insertions(+), 13 deletions(-) create mode 100644 pipeline/recon/web/waybackurls.py create mode 100644 pipeline/tools/waybackurls.yaml create mode 100644 tests/test_web/test_waybackurls.py diff --git a/README.md b/README.md index 895768c..8dcc1c9 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ The installer maintains a (naive) list of installed tools at `~/.local/recon-pipeline ## Defining a Scan's Scope -**New in v0.9.0**: In the event you're scanning a single ip address or host, simply use `--target`. It accepts a single target and works in conjunction with `--exempt-list` if specified. +**New as of v0.9.0**: In the event you're scanning a single ip address or host, simply use `--target`. It accepts a single target and works in conjunction with `--exempt-list` if specified. 
```text scan HTBScan --target 10.10.10.183 --top-ports 1000 @@ -146,6 +146,7 @@ Scan the target [-] SearchsploitScan queued [-] ThreadedNmapScan queued [-] SubjackScan queued +[-] WaybackurlsScan queued [-] AquatoneScan queued [-] GobusterScan queued [db-1] recon-pipeline> diff --git a/docs/api/scanners.rst b/docs/api/scanners.rst index eeda82c..37a6ea9 100644 --- a/docs/api/scanners.rst +++ b/docs/api/scanners.rst @@ -67,6 +67,13 @@ TKOSubs Scanner .. autoclass:: pipeline.recon.web.subdomain_takeover.TKOSubsScan :members: +WaybackurlsScan Scanner +####################### + +.. autoclass:: pipeline.recon.web.waybackurls.WaybackurlsScan + :members: + + Webanalyze Scanner ################## diff --git a/docs/overview/running_scans.rst b/docs/overview/running_scans.rst index bd01a45..188c134 100644 --- a/docs/overview/running_scans.rst +++ b/docs/overview/running_scans.rst @@ -15,6 +15,7 @@ following individual scans are available - :class:`pipeline.recon.web.subdomain_takeover.SubjackScan` - :class:`pipeline.recon.nmap.ThreadedNmapScan` - :class:`pipeline.recon.web.subdomain_takeover.TKOSubsScan` +- :class:`pipeline.recon.web.waybackurls.WaybackurlsScan` - :class:`pipeline.recon.web.webanalyze.WebanalyzeScan` Additionally, two wrapper scans are made available. These execute multiple scans in a pipeline. @@ -49,6 +50,7 @@ Create a targetfile /root/PycharmProjects/recon-pipeline/pipeline/recon-pipeline.py recon-pipeline> +**New as of v0.9.0**: In the event you're scanning a single ip address or host, simply use ``--target``. It accepts a single target and works in conjunction with ``--exempt-list`` if specified. 
Create a new database to store scan results @@ -79,6 +81,7 @@ Scan the target [-] WebanalyzeScan queued [-] SearchsploitScan queued [-] ThreadedNmapScan queued + [-] WaybackurlsScan queued [-] SubjackScan queued [-] AquatoneScan queued [-] GobusterScan queued diff --git a/pipeline/models/db_manager.py b/pipeline/models/db_manager.py index 5b409d5..46abd98 100644 --- a/pipeline/models/db_manager.py +++ b/pipeline/models/db_manager.py @@ -141,7 +141,7 @@ class DBManager: def get_status_codes(self): """ Simple helper that returns all status codes found during scanning """ - return set(str(x[0]) for x in self.session.query(Endpoint.status_code).all()) + return set(str(x[0]) for x in self.session.query(Endpoint.status_code).all() if x[0] is not None) def get_and_filter(self, model, defaults=None, **kwargs): """ Simple helper to either get an existing record if it exists otherwise create and return a new instance """ diff --git a/pipeline/recon-pipeline.py b/pipeline/recon-pipeline.py index 963b329..0de90a7 100755 --- a/pipeline/recon-pipeline.py +++ b/pipeline/recon-pipeline.py @@ -256,7 +256,7 @@ class ReconShell(cmd2.Cmd): """ Scan something. 
Possible scans include - AmassScan GobusterScan SearchsploitScan + AmassScan GobusterScan SearchsploitScan WaybackurlsScan ThreadedNmapScan WebanalyzeScan AquatoneScan FullScan MasscanScan SubjackScan TKOSubsScan HTBScan """ @@ -345,7 +345,6 @@ class ReconShell(cmd2.Cmd): if persistent_tool_dict.exists(): tools = pickle.loads(persistent_tool_dict.read_bytes()) - print(args.tool) if tools.get(args.tool).get("dependencies"): # get all of the requested tools dependencies @@ -605,7 +604,7 @@ class ReconShell(cmd2.Cmd): for endpoint in endpoints: color = color_map.get(str(endpoint.status_code)[0]) - if args.plain: + if args.plain or endpoint.status_code is None: results.append(endpoint.url) else: results.append(f"[{style(endpoint.status_code, fg=color)}] {endpoint.url}") diff --git a/pipeline/recon/config.py b/pipeline/recon/config.py index 93351a7..a8b8543 100644 --- a/pipeline/recon/config.py +++ b/pipeline/recon/config.py @@ -10,14 +10,13 @@ defaults = { "gobuster-extensions": "", "results-dir": "recon-results", "aquatone-scan-timeout": "900", - "tools-dir": f"{Path.home()}/.local/recon-pipeline/tools", - "database-dir": f"{Path.home()}/.local/recon-pipeline/databases", - "home": Path.home().expanduser().resolve(), + "home": Path.home(), } +defaults["tools-dir"] = f"{defaults.get('home')}/.local/recon-pipeline/tools" +defaults["database-dir"] = f"{defaults.get('home')}/.local/recon-pipeline/databases" defaults["gobuster-wordlist"] = f"{defaults.get('tools-dir')}/seclists/Discovery/Web-Content/common.txt" - tool_paths = { "aquatone": f"{defaults.get('tools-dir')}/aquatone", "tko-subs": f"{Path.home()}/go/bin/tko-subs", @@ -34,6 +33,7 @@ tool_paths = { "luigid": str(Path(__file__).parents[2] / "luigid.service"), "seclists": f"{defaults.get('tools-dir')}/seclists", "exploitdb": f"{defaults.get('tools-dir')}/exploitdb", + "waybackurls": f"{Path.home()}/go/bin/waybackurls", } web_ports = { diff --git a/pipeline/recon/web/__init__.py b/pipeline/recon/web/__init__.py index 
ec64049..96d0920 100644 --- a/pipeline/recon/web/__init__.py +++ b/pipeline/recon/web/__init__.py @@ -2,4 +2,5 @@ from .aquatone import AquatoneScan from .gobuster import GobusterScan from .targets import GatherWebTargets from .webanalyze import WebanalyzeScan +from .waybackurls import WaybackurlsScan from .subdomain_takeover import SubjackScan, TKOSubsScan diff --git a/pipeline/recon/web/waybackurls.py b/pipeline/recon/web/waybackurls.py new file mode 100644 index 0000000..f09db7c --- /dev/null +++ b/pipeline/recon/web/waybackurls.py @@ -0,0 +1,117 @@ +import subprocess +from pathlib import Path +from urllib.parse import urlparse + +import luigi +from luigi.util import inherits +from luigi.contrib.sqla import SQLAlchemyTarget + +from .targets import GatherWebTargets +from ...tools import tools +from ...models.endpoint_model import Endpoint + +import pipeline.models.db_manager + + +@inherits(GatherWebTargets) +class WaybackurlsScan(luigi.Task): + """ Fetch known URLs from the Wayback Machine, Common Crawl, and Virus Total for historic data about the target. + + Install: + .. code-block:: console + + go get github.com/tomnomnom/waybackurls + + Basic Example: + ``waybackurls`` commands are structured like the example below. + + ``cat domains.txt | waybackurls > urls`` + + Luigi Example: + .. code-block:: python + + PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.web.waybackurls WaybackurlsScan --target-file tesla --top-ports 1000 + + Args: + db_location: specifies the path to the database used for storing results *Required by upstream Task* + exempt_list: Path to a file providing blacklisted subdomains, one per line. 
*Optional by upstream Task* + top_ports: Scan top N most popular ports *Required by upstream Task* + ports: specifies the port(s) to be scanned *Required by upstream Task* + interface: use the named raw network interface, such as "eth0" *Required by upstream Task* + rate: desired rate for transmitting packets (packets per second) *Required by upstream Task* + target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task* + results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task* + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location) + self.results_subfolder = Path(self.results_dir) / "waybackurls-results" + + def requires(self): + """ WaybackurlsScan depends on GatherWebTargets to run. + + GatherWebTargets accepts exempt_list and expects rate, target_file, interface, + and either ports or top_ports as parameters + + Returns: + luigi.Task - GatherWebTargets + """ + args = { + "results_dir": self.results_dir, + "rate": self.rate, + "target_file": self.target_file, + "top_ports": self.top_ports, + "interface": self.interface, + "ports": self.ports, + "exempt_list": self.exempt_list, + "db_location": self.db_location, + } + return GatherWebTargets(**args) + + def output(self): + """ Returns the target output for this task. + + Returns: + luigi.contrib.sqla.SQLAlchemyTarget + """ + return SQLAlchemyTarget( + connection_string=self.db_mgr.connection_string, target_table="endpoint", update_id=self.task_id + ) + + def run(self): + """ Defines the options/arguments sent to waybackurls after processing. 
""" + self.results_subfolder.mkdir(parents=True, exist_ok=True) + + command = [tools.get("waybackurls").get("path")] + + waybackurls_input_file = self.results_subfolder / "input-from-webtargets" + + with open(waybackurls_input_file, "w") as f: + for target in self.db_mgr.get_all_hostnames(): + f.write(f"{target}\n") + + with open(waybackurls_input_file) as target_list: + proc = subprocess.run(command, stdin=target_list, stdout=subprocess.PIPE) + + for url in proc.stdout.decode().splitlines(): + if not url: + continue + + parsed_url = urlparse(url) + + # get Target, may exist already or not + ip_or_hostname = parsed_url.hostname + tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(ip_or_hostname) + + endpoint = self.db_mgr.get_or_create(Endpoint, url=url, target=tgt) + + if endpoint not in tgt.endpoints: + tgt.endpoints.append(endpoint) + + self.db_mgr.add(tgt) + self.db_mgr.add(endpoint) + + self.output().touch() + + waybackurls_input_file.unlink() diff --git a/pipeline/recon/wrappers.py b/pipeline/recon/wrappers.py index 3401dc0..327fbd3 100644 --- a/pipeline/recon/wrappers.py +++ b/pipeline/recon/wrappers.py @@ -2,10 +2,7 @@ import luigi from luigi.util import inherits from .nmap import SearchsploitScan -from .web import AquatoneScan -from .web import GobusterScan -from .web import WebanalyzeScan -from .web import TKOSubsScan, SubjackScan +from .web import AquatoneScan, GobusterScan, SubjackScan, TKOSubsScan, WaybackurlsScan, WebanalyzeScan @inherits(SearchsploitScan, AquatoneScan, TKOSubsScan, SubjackScan, GobusterScan, WebanalyzeScan) @@ -68,6 +65,7 @@ class FullScan(luigi.WrapperTask): del args["threads"] yield TKOSubsScan(**args) + yield WaybackurlsScan(**args) @inherits(SearchsploitScan, AquatoneScan, GobusterScan, WebanalyzeScan) diff --git a/pipeline/tools/waybackurls.yaml b/pipeline/tools/waybackurls.yaml new file mode 100644 index 0000000..7badb9f --- /dev/null +++ b/pipeline/tools/waybackurls.yaml @@ -0,0 +1,9 @@ +installed: false +dependencies: 
["go"] +go: &gotool !get_tool_path "{go}" +path: !join_path [!get_default "{home}", go, bin, waybackurls] + +commands: +- !join [*gotool, get, github.com/tomnomnom/waybackurls] + +shell: false diff --git a/tests/test_recon/test_helpers.py b/tests/test_recon/test_helpers.py index 5a19889..5e7aee4 100644 --- a/tests/test_recon/test_helpers.py +++ b/tests/test_recon/test_helpers.py @@ -18,6 +18,7 @@ def test_get_scans(): "SearchsploitScan", "ThreadedNmapScan", "WebanalyzeScan", + "WaybackurlsScan", ] assert len(scan_names) == len(scans.keys()) diff --git a/tests/test_shell/test_recon_pipeline_shell.py b/tests/test_shell/test_recon_pipeline_shell.py index f80971c..d4937eb 100644 --- a/tests/test_shell/test_recon_pipeline_shell.py +++ b/tests/test_shell/test_recon_pipeline_shell.py @@ -440,6 +440,12 @@ class TestReconShell: ], "shell": True, }, + "waybackurls": { + "installed": True, + "depencencies": ["go"], + "commands": ["/usr/local/go/bin/go get github.com/tomnomnom/waybackurls"], + "shell": True, + }, } tooldir = tmp_path / ".local" / "recon-pipeline" / "tools" diff --git a/tests/test_web/test_waybackurls.py b/tests/test_web/test_waybackurls.py new file mode 100644 index 0000000..1d792e9 --- /dev/null +++ b/tests/test_web/test_waybackurls.py @@ -0,0 +1,57 @@ +import shutil +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +from pipeline.recon.web import WaybackurlsScan, GatherWebTargets + + +class TestGatherWebTargets: + def setup_method(self): + self.tmp_path = Path(tempfile.mkdtemp()) + self.scan = WaybackurlsScan( + target_file=__file__, results_dir=str(self.tmp_path), db_location=str(self.tmp_path / "testing.sqlite") + ) + + def teardown_method(self): + shutil.rmtree(self.tmp_path) + + def test_scan_requires(self): + with patch("pipeline.recon.web.GatherWebTargets"): + retval = self.scan.requires() + assert isinstance(retval, GatherWebTargets) + + def test_scan_creates_database(self): + assert 
self.scan.db_mgr.location.exists() + assert self.tmp_path / "testing.sqlite" == self.scan.db_mgr.location + + def test_scan_creates_results_dir(self): + assert self.scan.results_subfolder == self.tmp_path / "waybackurls-results" + + def test_scan_run(self): + with patch("subprocess.run", autospec=True) as mocked_run: + self.scan.results_subfolder = self.tmp_path / "waybackurls-results" + + self.scan.db_mgr.get_all_hostnames = MagicMock() + self.scan.db_mgr.get_all_hostnames.return_value = ["google.com"] + + completed_process_mock = MagicMock() + completed_process_mock.stdout.return_value = b"https://drive.google.com\nhttps://maps.google.com\n\n" + completed_process_mock.stdout.decode.return_value = "https://drive.google.com\nhttps://maps.google.com\n\n" + completed_process_mock.stdout.decode.splitlines.return_value = [ + "https://drive.google.com", + "https://maps.google.com", + ] + + mocked_run.return_value = completed_process_mock + + self.scan.db_mgr.add = MagicMock() + self.scan.db_mgr.get_or_create = MagicMock() + self.scan.db_mgr.get_or_create_target_by_ip_or_hostname = MagicMock() + + self.scan.run() + + assert mocked_run.called + assert self.scan.db_mgr.add.called + assert self.scan.db_mgr.get_or_create.called + assert self.scan.db_mgr.get_or_create_target_by_ip_or_hostname.called