Security review fixes
This commit is contained in:
@@ -64,6 +64,19 @@ HTML_TAG_RE = re.compile(r"<[^>]+>")
|
||||
# Fenced Markdown code block: an opening ``` through the nearest closing ```.
# [\s\S] already spans newlines, and the pattern has no ^/$ anchors, so no
# regex flags are needed.
CODE_BLOCK_RE = re.compile(r"```[\s\S]*?```")
# Any run of one or more whitespace characters (spaces, tabs, newlines, ...).
MULTI_WHITESPACE_RE = re.compile(r"\s+")
# Upper bound, in characters, on the text returned by sanitize_text().
MAX_CHUNK_LEN = 2000
|
||||
# Regex fragments that sanitize_query() replaces with a space.
# They are applied one at a time, in list order, with re.IGNORECASE.
INJECTION_PATTERNS = [
    r"\x00",  # Null bytes
    r"\x1a",  # EOF character
    r"--\s",  # SQL line comment
    r"/\*[\s\S]*?\*/",  # SQL block comment
    r"';",  # SQL injection
    r"\b(DROP|DELETE|INSERT|UPDATE|SELECT)\b",  # SQL keywords
    # [\s\S] instead of "." so a <script> element that spans several lines
    # is still matched when the caller does not pass re.DOTALL.
    r"<script[^>]*>[\s\S]*?</script>",  # XSS
    r"javascript:",  # JS injection
    r"\b(eval|exec|spawn|fork|system)\b",  # Code execution
]

# Maximum number of characters kept in a sanitized query.
MAX_QUERY_LENGTH = 1000
|
||||
|
||||
|
||||
def sanitize_text(raw: str) -> str:
    """Sanitize raw vault content before embedding.

    Steps, in order:

    - Remove fenced code blocks (each is replaced by a single space)
    - Strip HTML tags (prevent XSS)
    - Trim the ends and normalize whitespace runs to single spaces
    - Cap length at MAX_CHUNK_LEN chars

    Args:
        raw: Unsanitized text.

    Returns:
        The cleaned text, at most MAX_CHUNK_LEN characters long.
    """
    # A fenced block becomes a space so the words on either side of it
    # stay separate.
    text = CODE_BLOCK_RE.sub(" ", raw)
    # Tags are removed outright, without a replacement character.
    text = HTML_TAG_RE.sub("", text)
    text = MULTI_WHITESPACE_RE.sub(" ", text.strip())
    # Slicing is a no-op when the text is already within the limit.
    return text[:MAX_CHUNK_LEN]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
@@ -93,6 +125,26 @@ def sanitize_text(raw: str) -> str:
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def sanitize_query(query: str) -> str:
    """Sanitize a search query by stripping known injection patterns.

    - Replace every INJECTION_PATTERNS match (case-insensitive) with a space
    - Trim the ends and normalize whitespace runs to single spaces
    - Limit length to MAX_QUERY_LENGTH chars

    Args:
        query: Raw query text.

    Returns:
        The cleaned query: at most MAX_QUERY_LENGTH characters, with no
        leading or trailing whitespace.
    """
    # Substituting a space (rather than "") keeps the text on either side
    # of a removed match from fusing together.
    for pattern in INJECTION_PATTERNS:
        query = re.sub(pattern, " ", query, flags=re.IGNORECASE)

    query = MULTI_WHITESPACE_RE.sub(" ", query.strip())

    # The cut can land right after a separator space; trim it so the
    # result never ends in whitespace.
    return query[:MAX_QUERY_LENGTH].rstrip()
|
||||
|
||||
def detect_sensitive(
|
||||
text: str,
|
||||
sensitive_sections: list[str],
|
||||
|
||||
Reference in New Issue
Block a user