mirror of
https://github.com/aljazceru/recon-pipeline.git
synced 2025-12-21 08:14:21 +01:00
WIP: add waybackurls scan (#56)
* fixed up config.defaults definition; tools-dir and database-dir now use defaults.home value * added tool definition file; closes #54 * added basic PoC for waybackurls scanner; updated helpers.py test * added Endpoint/Target parsing; updated existing tests to pass * added tests for waybackurls * added WaybackurlsScan to FullScan * added documentation for WaybackurlsScan
This commit is contained in:
117
pipeline/recon/web/waybackurls.py
Normal file
117
pipeline/recon/web/waybackurls.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import luigi
|
||||
from luigi.util import inherits
|
||||
from luigi.contrib.sqla import SQLAlchemyTarget
|
||||
|
||||
from .targets import GatherWebTargets
|
||||
from ...tools import tools
|
||||
from ...models.endpoint_model import Endpoint
|
||||
|
||||
import pipeline.models.db_manager
|
||||
|
||||
|
||||
@inherits(GatherWebTargets)
class WaybackurlsScan(luigi.Task):
    """ Fetch known URLs from the Wayback Machine, Common Crawl, and Virus Total for historic data about the target.

    Install:
        .. code-block:: console

            go get github.com/tomnomnom/waybackurls

    Basic Example:
        ``waybackurls`` commands are structured like the example below.

        ``cat domains.txt | waybackurls > urls``

    Luigi Example:
        .. code-block:: console

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.web.waybackurls WaybackurlsScan --target-file tesla --top-ports 1000

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # database handle shared by output() and run()
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        # scratch directory for this task, created lazily in run()
        self.results_subfolder = Path(self.results_dir) / "waybackurls-results"

    def requires(self):
        """ WaybackurlsScan depends on GatherWebTargets to run.

        GatherWebTargets accepts exempt_list and expects rate, target_file, interface,
                         and either ports or top_ports as parameters

        Returns:
            luigi.Task - GatherWebTargets
        """
        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "exempt_list": self.exempt_list,
            "db_location": self.db_location,
        }
        return GatherWebTargets(**args)

    def output(self):
        """ Returns the target output for this task.

        Completion is recorded against the ``endpoint`` table, keyed by this task's ``task_id``.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="endpoint", update_id=self.task_id
        )

    def run(self):
        """ Defines the options/arguments sent to waybackurls after processing.

        Every entry returned by ``db_mgr.get_all_hostnames()`` is written, one per line, to a
        scratch file that is handed to ``waybackurls`` on stdin.  Each non-empty line the tool
        prints on stdout is treated as a URL and stored as an Endpoint linked to the Target that
        matches the URL's host.  Lines that carry no host component are skipped.

        The scratch file is removed when the method exits, whether or not the scan succeeded.
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        command = [tools.get("waybackurls").get("path")]

        waybackurls_input_file = self.results_subfolder / "input-from-webtargets"

        try:
            with open(waybackurls_input_file, "w") as f:
                for target in self.db_mgr.get_all_hostnames():
                    f.write(f"{target}\n")

            with open(waybackurls_input_file) as target_list:
                proc = subprocess.run(command, stdin=target_list, stdout=subprocess.PIPE)

            for url in proc.stdout.decode().splitlines():
                if not url:
                    continue

                # get Target, may exist already or not
                ip_or_hostname = urlparse(url).hostname

                if not ip_or_hostname:
                    # urlparse yields no hostname when the line has no network location;
                    # looking up a Target by None would tie the endpoint to an unrelated record
                    continue

                tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(ip_or_hostname)

                endpoint = self.db_mgr.get_or_create(Endpoint, url=url, target=tgt)

                if endpoint not in tgt.endpoints:
                    tgt.endpoints.append(endpoint)

                self.db_mgr.add(tgt)
                self.db_mgr.add(endpoint)

            self.output().touch()
        finally:
            # the input file is only scratch data; never leave it behind, even on failure
            if waybackurls_input_file.exists():
                waybackurls_input_file.unlink()
|
||||
Reference in New Issue
Block a user