Files
recon-pipeline/pipeline/recon/web/waybackurls.py
Ryan Good d7dbd1e7b3 Dependency Checking (#75)
* Adds req testing methodology, needs fixes

* Improves dependency exception handling

* Better meets_requirements implementation

Still need to adjust tests to fake installation

* Changes to exception boolean to enable tool check

tests and class variables modified for new tool check

* Adjust test_get_scans to use appropriate variable

* Adds Go requirement where relevant

* Adds missing scan dependencies

* Add clarification to error message
2020-08-07 08:48:49 -05:00

123 lines
4.4 KiB
Python

import subprocess
from pathlib import Path
from urllib.parse import urlparse
import luigi
from luigi.util import inherits
from luigi.contrib.sqla import SQLAlchemyTarget
from .targets import GatherWebTargets
from ...tools import tools
from ..helpers import meets_requirements
from ...models.endpoint_model import Endpoint
import pipeline.models.db_manager
@inherits(GatherWebTargets)
class WaybackurlsScan(luigi.Task):
    """ Fetch known URLs from the Wayback Machine, Common Crawl, and Virus Total for historic data about the target.

    Install:
        .. code-block:: console

            go get github.com/tomnomnom/waybackurls

    Basic Example:
        ``waybackurls`` commands are structured like the example below.

        ``cat domains.txt | waybackurls > urls``

    Luigi Example:
        .. code-block:: python

            PYTHONPATH=$(pwd) luigi --local-scheduler --module recon.web.waybackurls WaybackurlsScan --target-file tesla --top-ports 1000

    Args:
        db_location: specifies the path to the database used for storing results *Required by upstream Task*
        exempt_list: Path to a file providing blacklisted subdomains, one per line. *Optional by upstream Task*
        top_ports: Scan top N most popular ports *Required by upstream Task*
        ports: specifies the port(s) to be scanned *Required by upstream Task*
        interface: use the named raw network interface, such as "eth0" *Required by upstream Task*
        rate: desired rate for transmitting packets (packets per second) *Required by upstream Task*
        target_file: specifies the file on disk containing a list of ips or domains *Required by upstream Task*
        results_dir: specifies the directory on disk to which all Task results are written *Required by upstream Task*
    """

    # Tool names handed to meets_requirements() before the upstream task is built.
    requirements = ["go", "waybackurls", "masscan"]
    # Second argument to meets_requirements(); presumably makes a missing tool
    # raise instead of returning a falsy value -- confirm against helpers.
    exception = True

    def __init__(self, *args, **kwargs):
        """ Set up the database manager and the directory that holds this scan's working files. """
        super().__init__(*args, **kwargs)
        self.db_mgr = pipeline.models.db_manager.DBManager(db_location=self.db_location)
        self.results_subfolder = Path(self.results_dir) / "waybackurls-results"

    def requires(self):
        """ WaybackurlsScan depends on GatherWebTargets to run.

        GatherWebTargets accepts exempt_list and expects rate, target_file, interface,
        and either ports or top_ports as parameters

        The tool requirements are checked first, before the upstream task is constructed.

        Returns:
            luigi.Task - GatherWebTargets
        """
        meets_requirements(self.requirements, self.exception)

        args = {
            "results_dir": self.results_dir,
            "rate": self.rate,
            "target_file": self.target_file,
            "top_ports": self.top_ports,
            "interface": self.interface,
            "ports": self.ports,
            "exempt_list": self.exempt_list,
            "db_location": self.db_location,
        }
        return GatherWebTargets(**args)

    def output(self):
        """ Returns the target output for this task.

        The target is a marker on the ``endpoint`` table keyed by this task's id.

        Returns:
            luigi.contrib.sqla.SQLAlchemyTarget
        """
        return SQLAlchemyTarget(
            connection_string=self.db_mgr.connection_string, target_table="endpoint", update_id=self.task_id
        )

    def run(self):
        """ Defines the options/arguments sent to waybackurls after processing.

        Writes every hostname known to the database into a temporary input file,
        feeds that file to ``waybackurls`` on stdin, and records each URL printed
        on stdout as an Endpoint attached to its Target. The temporary input file
        is removed whether or not the scan succeeds.
        """
        self.results_subfolder.mkdir(parents=True, exist_ok=True)

        command = [tools.get("waybackurls").get("path")]

        waybackurls_input_file = self.results_subfolder / "input-from-webtargets"

        try:
            with open(waybackurls_input_file, "w") as f:
                for target in self.db_mgr.get_all_hostnames():
                    f.write(f"{target}\n")

            with open(waybackurls_input_file) as target_list:
                proc = subprocess.run(command, stdin=target_list, stdout=subprocess.PIPE)

            for url in proc.stdout.decode().splitlines():
                if not url:
                    continue

                parsed_url = urlparse(url)

                # get Target, may exist already or not
                ip_or_hostname = parsed_url.hostname

                if ip_or_hostname is None:
                    # urlparse reports no hostname for lines lacking a network
                    # location; there is no target to attach such a line to.
                    continue

                tgt = self.db_mgr.get_or_create_target_by_ip_or_hostname(ip_or_hostname)

                endpoint = self.db_mgr.get_or_create(Endpoint, url=url, target=tgt)

                if endpoint not in tgt.endpoints:
                    tgt.endpoints.append(endpoint)

                self.db_mgr.add(tgt)
                self.db_mgr.add(endpoint)

            self.output().touch()
        finally:
            # Clean up on failure as well as success so a crashed scan does not
            # leave a stale input file behind in the results directory.
            if waybackurls_input_file.exists():
                waybackurls_input_file.unlink()