Use pybluemonday instead of lxml for html sanitization (#1837)

* Use `pybluemonday` instead of `lxml` for html sanitization
* Fix boolean optional configs in `config.py`
* Closes #1835
This commit is contained in:
Kevin Chung
2021-03-19 01:29:49 -04:00
committed by GitHub
parent 8de9819bd4
commit a045114251
5 changed files with 109 additions and 67 deletions

View File

@@ -46,12 +46,16 @@
**Deployment** **Deployment**
- Fix boolean configs from the `config.ini` optional section
- Install `python3-dev` instead of `python-dev` in apt - Install `python3-dev` instead of `python-dev` in apt
- Bump lxml to 4.6.2 - Require `pybluemonday` as pip dependency
- Remove `lxml` and `html5lib` from pip dependencies
- Bump pip-compile to 5.4.0 - Bump pip-compile to 5.4.0
**Miscellaneous** **Miscellaneous**
- Rewrite the HTML santiziation feature (controlled by `HTML_SANITIZATION`) to use the pybluemonday library instead of lxml/html5lib
- Note that this feature is still in beta
- Cache Docker builds more by copying and installing Python dependencies before copying CTFd - Cache Docker builds more by copying and installing Python dependencies before copying CTFd
- Change the default emails slightly and rework confirmation email page to make some recommendations clearer - Change the default emails slightly and rework confirmation email page to make some recommendations clearer
- Use `examplectf.com` as testing/development domain instead of `ctfd.io` - Use `examplectf.com` as testing/development domain instead of `ctfd.io`

View File

@@ -174,25 +174,25 @@ class ServerConfig(object):
AWS_S3_ENDPOINT_URL: str = empty_str_cast(config_ini["uploads"]["AWS_S3_ENDPOINT_URL"]) AWS_S3_ENDPOINT_URL: str = empty_str_cast(config_ini["uploads"]["AWS_S3_ENDPOINT_URL"])
# === OPTIONAL === # === OPTIONAL ===
REVERSE_PROXY: bool = empty_str_cast(config_ini["optional"]["REVERSE_PROXY"], default=False) REVERSE_PROXY: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["REVERSE_PROXY"], default=False))
TEMPLATES_AUTO_RELOAD: bool = empty_str_cast(config_ini["optional"]["TEMPLATES_AUTO_RELOAD"], default=True) TEMPLATES_AUTO_RELOAD: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["TEMPLATES_AUTO_RELOAD"], default=True))
THEME_FALLBACK: bool = empty_str_cast(config_ini["optional"]["THEME_FALLBACK"], default=False) THEME_FALLBACK: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["THEME_FALLBACK"], default=False))
SQLALCHEMY_TRACK_MODIFICATIONS: bool = empty_str_cast(config_ini["optional"]["SQLALCHEMY_TRACK_MODIFICATIONS"], default=False) SQLALCHEMY_TRACK_MODIFICATIONS: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["SQLALCHEMY_TRACK_MODIFICATIONS"], default=False))
SWAGGER_UI: bool = empty_str_cast(config_ini["optional"]["SWAGGER_UI"], default=False) SWAGGER_UI: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["SWAGGER_UI"], default=False))
SWAGGER_UI_ENDPOINT: str = "/" if SWAGGER_UI else None SWAGGER_UI_ENDPOINT: str = "/" if SWAGGER_UI else None
UPDATE_CHECK: bool = empty_str_cast(config_ini["optional"]["UPDATE_CHECK"], default=True) UPDATE_CHECK: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["UPDATE_CHECK"], default=True))
APPLICATION_ROOT: str = empty_str_cast(config_ini["optional"]["APPLICATION_ROOT"], default="/") APPLICATION_ROOT: str = empty_str_cast(config_ini["optional"]["APPLICATION_ROOT"], default="/")
SERVER_SENT_EVENTS: bool = empty_str_cast(config_ini["optional"]["SERVER_SENT_EVENTS"], default=True) SERVER_SENT_EVENTS: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["SERVER_SENT_EVENTS"], default=True))
HTML_SANITIZATION: bool = empty_str_cast(config_ini["optional"]["HTML_SANITIZATION"], default=False) HTML_SANITIZATION: bool = process_boolean_str(empty_str_cast(config_ini["optional"]["HTML_SANITIZATION"], default=False))
if DATABASE_URL.startswith("sqlite") is False: if DATABASE_URL.startswith("sqlite") is False:
SQLALCHEMY_ENGINE_OPTIONS = { SQLALCHEMY_ENGINE_OPTIONS = {

View File

@@ -1,56 +1,97 @@
# Bandit complains about security issues with lxml. from pybluemonday import UGCPolicy
# These issues have been addressed in the past and do not apply to parsing HTML.
from lxml.html import html5parser, tostring # nosec B410
from lxml.html.clean import Cleaner # nosec B410
from lxml.html.defs import safe_attrs # nosec B410
cleaner = Cleaner( # Copied from lxml:
comments=False, # https://github.com/lxml/lxml/blob/e986a9cb5d54827c59aefa8803bc90954d67221e/src/lxml/html/defs.py#L38
page_structure=False, # fmt: off
embedded=False, SAFE_ATTRS = (
frames=False, 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
forms=False, 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
links=False, 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
meta=False, 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
style=False, 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
safe_attrs=( 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
safe_attrs 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
| { 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
"style", 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
# Allow data attributes from bootstrap elements 'type', 'usemap', 'valign', 'value', 'vspace', 'width'
"data-toggle",
"data-target",
"data-dismiss",
"data-spy",
"data-offset",
"data-html",
"data-placement",
"data-parent",
"data-title",
"data-template",
"data-interval",
"data-keyboard",
"data-pause",
"data-ride",
"data-wrap",
"data-touch",
"data-flip",
"data-boundary",
"data-reference",
"data-display",
"data-animation",
"data-container",
"data-delay",
"data-selector",
"data-content",
"data-trigger",
}
),
annoying_tags=False,
) )
# fmt: on
PAGE_STRUCTURE_TAGS = {
"title": [],
}
META_TAGS = {
"meta": ["name", "content", "property"],
}
FORM_TAGS = {
"form": ["method", "action"],
"button": ["name", "type", "value", "disabled"],
"input": ["name", "type", "value", "placeholder"],
"select": ["name", "value", "placeholder"],
"option": ["value"],
"textarea": ["name", "value", "placeholder"],
"label": ["for"],
}
ANNOYING_TAGS = {
"blink": [],
"marquee": [],
}
MEDIA_TAGS = {
"audio": ["autoplay", "controls", "crossorigin", "loop", "muted", "preload", "src"],
"video": [
"autoplay",
"buffered",
"controls",
"crossorigin",
"loop",
"muted",
"playsinline",
"poster",
"preload",
"src",
],
"source": ["src", "type"],
"iframe": ["width", "height", "src", "frameborder", "allow", "allowfullscreen"],
}
SANITIZER = UGCPolicy()
for TAGS in (PAGE_STRUCTURE_TAGS, META_TAGS, FORM_TAGS, ANNOYING_TAGS, MEDIA_TAGS):
for element in TAGS:
SANITIZER.AllowElements(element)
SANITIZER.AllowAttrs(*TAGS[element]).OnElements(element)
# Allow safe attrs copied from lxml
SANITIZER.AllowAttrs(*SAFE_ATTRS).Globally()
# Allow styling globally
SANITIZER.AllowAttrs("class", "style").Globally()
# Allow styling via bluemonday
SANITIZER.AllowStyling()
# Allow safe convenience functions from bluemonday
SANITIZER.AllowStandardAttributes()
SANITIZER.AllowStandardURLs()
# Allow data atributes
SANITIZER.AllowDataAttributes()
# Allow data URI images
SANITIZER.AllowDataURIImages()
# Link security
SANITIZER.AllowRelativeURLs(True)
SANITIZER.RequireNoFollowOnFullyQualifiedLinks(True)
SANITIZER.RequireNoFollowOnLinks(True)
SANITIZER.RequireNoReferrerOnFullyQualifiedLinks(True)
SANITIZER.RequireNoReferrerOnLinks(True)
def sanitize_html(html): def sanitize_html(html):
html = html5parser.fragment_fromstring(html, create_parent="div") return SANITIZER.sanitize(html)
html = cleaner.clean_html(tostring(html)).decode()
return html

View File

@@ -24,9 +24,8 @@ marshmallow-sqlalchemy==0.17.0
boto3==1.13.9 boto3==1.13.9
marshmallow==2.20.2 marshmallow==2.20.2
pydantic==1.5.1 pydantic==1.5.1
lxml==4.6.2
html5lib==1.0.1
WTForms==2.3.1 WTForms==2.3.1
python-geoacumen==0.0.1 python-geoacumen==0.0.1
maxminddb==1.5.4 maxminddb==1.5.4
tenacity==6.2.0 tenacity==6.2.0
pybluemonday==0.0.3

View File

@@ -11,7 +11,7 @@ bcrypt==3.1.7 # via -r requirements.in
boto3==1.13.9 # via -r requirements.in boto3==1.13.9 # via -r requirements.in
botocore==1.16.26 # via boto3, s3transfer botocore==1.16.26 # via boto3, s3transfer
certifi==2020.11.8 # via requests certifi==2020.11.8 # via requests
cffi==1.14.3 # via bcrypt, cmarkgfm cffi==1.14.3 # via bcrypt, cmarkgfm, pybluemonday
chardet==3.0.4 # via requests chardet==3.0.4 # via requests
click==7.1.2 # via flask click==7.1.2 # via flask
cmarkgfm==0.4.2 # via -r requirements.in cmarkgfm==0.4.2 # via -r requirements.in
@@ -27,20 +27,19 @@ flask==1.1.2 # via -r requirements.in, flask-caching, flask-marshma
gevent==20.9.0 # via -r requirements.in gevent==20.9.0 # via -r requirements.in
greenlet==0.4.17 # via gevent greenlet==0.4.17 # via gevent
gunicorn==20.0.4 # via -r requirements.in gunicorn==20.0.4 # via -r requirements.in
html5lib==1.0.1 # via -r requirements.in
idna==2.10 # via requests idna==2.10 # via requests
importlib-metadata==2.0.0 # via jsonschema importlib-metadata==2.0.0 # via jsonschema
itsdangerous==1.1.0 # via -r requirements.in, flask itsdangerous==1.1.0 # via -r requirements.in, flask
jinja2==2.11.2 # via -r requirements.in, flask jinja2==2.11.2 # via -r requirements.in, flask
jmespath==0.10.0 # via boto3, botocore jmespath==0.10.0 # via boto3, botocore
jsonschema==3.2.0 # via flask-restx jsonschema==3.2.0 # via flask-restx
lxml==4.6.2 # via -r requirements.in
mako==1.1.3 # via alembic mako==1.1.3 # via alembic
markupsafe==1.1.1 # via jinja2, mako, wtforms markupsafe==1.1.1 # via jinja2, mako, wtforms
marshmallow-sqlalchemy==0.17.0 # via -r requirements.in marshmallow-sqlalchemy==0.17.0 # via -r requirements.in
marshmallow==2.20.2 # via -r requirements.in, flask-marshmallow, marshmallow-sqlalchemy marshmallow==2.20.2 # via -r requirements.in, flask-marshmallow, marshmallow-sqlalchemy
maxminddb==1.5.4 # via -r requirements.in, python-geoacumen maxminddb==1.5.4 # via -r requirements.in, python-geoacumen
passlib==1.7.2 # via -r requirements.in passlib==1.7.2 # via -r requirements.in
pybluemonday==0.0.3 # via -r requirements.in
pycparser==2.20 # via cffi pycparser==2.20 # via cffi
pydantic==1.5.1 # via -r requirements.in pydantic==1.5.1 # via -r requirements.in
pymysql==0.9.3 # via -r requirements.in pymysql==0.9.3 # via -r requirements.in
@@ -53,12 +52,11 @@ pytz==2020.4 # via flask-restx
redis==3.5.2 # via -r requirements.in redis==3.5.2 # via -r requirements.in
requests==2.23.0 # via -r requirements.in requests==2.23.0 # via -r requirements.in
s3transfer==0.3.3 # via boto3 s3transfer==0.3.3 # via boto3
six==1.15.0 # via bcrypt, flask-marshmallow, flask-restx, html5lib, jsonschema, python-dateutil, sqlalchemy-utils, tenacity six==1.15.0 # via bcrypt, flask-marshmallow, flask-restx, jsonschema, python-dateutil, sqlalchemy-utils, tenacity
sqlalchemy-utils==0.36.6 # via -r requirements.in sqlalchemy-utils==0.36.6 # via -r requirements.in
sqlalchemy==1.3.17 # via -r requirements.in, alembic, dataset, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-utils sqlalchemy==1.3.17 # via -r requirements.in, alembic, dataset, flask-sqlalchemy, marshmallow-sqlalchemy, sqlalchemy-utils
tenacity==6.2.0 # via -r requirements.in tenacity==6.2.0 # via -r requirements.in
urllib3==1.25.11 # via botocore, requests urllib3==1.25.11 # via botocore, requests
webencodings==0.5.1 # via html5lib
werkzeug==1.0.1 # via -r requirements.in, flask, flask-restx werkzeug==1.0.1 # via -r requirements.in, flask, flask-restx
wtforms==2.3.1 # via -r requirements.in wtforms==2.3.1 # via -r requirements.in
zipp==3.4.0 # via importlib-metadata zipp==3.4.0 # via importlib-metadata