mirror of
https://github.com/Cisco-Talos/clamav.git
synced 2025-10-19 10:23:17 +00:00
Fix possible crash in HTML CSS image extraction
When processing UTF-8 HTML code, the image extraction logic may panic if the string contains a multi-byte grapheme that includes a '(', ')', whitespace, or one of the other characters used to split the text when searching for the base64 image content. The panic occurs because the `split_at()` method panics when the given byte offset does not fall on a UTF-8 character boundary, which is what happens when a split lands in the middle of a multi-byte Unicode grapheme. This commit fixes the issue by processing the HTML string one grapheme at a time instead of one `char` at a time. The `grapheme_indices()` method is used to get the byte offset of the start of each grapheme, so that the string is always split at a valid position.
This commit is contained in:
parent
7a6fe78172
commit
2a21451e1f
4 changed files with 36 additions and 27 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
@ -170,6 +170,7 @@ dependencies = [
|
|||
"tempfile",
|
||||
"thiserror",
|
||||
"transpose",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -1170,6 +1171,12 @@ version = "1.0.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.10"
|
||||
|
|
|
@ -18,6 +18,7 @@ transpose = "0.2"
|
|||
num-traits = "0.2"
|
||||
base64 = "0.21.0"
|
||||
sha1 = "0.10.5"
|
||||
unicode-segmentation = "1.10.1"
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
||||
|
|
|
@ -1032,7 +1032,7 @@ fn process_line(ctx: &mut Context, line: &[u8]) -> Result<(), InputError> {
|
|||
cmd_unlink(ctx, unlink_op)
|
||||
}
|
||||
_ => Err(InputError::UnknownCommand(
|
||||
String::from_utf8_lossy(&cmd).to_string(),
|
||||
String::from_utf8_lossy(cmd).to_string(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ use std::{ffi::CStr, mem::ManuallyDrop, os::raw::c_char};
|
|||
use base64::{engine::general_purpose as base64_engine_standard, Engine as _};
|
||||
use log::{debug, error, warn};
|
||||
use thiserror::Error;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
use crate::sys;
|
||||
|
||||
|
@ -73,12 +74,12 @@ impl<'a> CssImageExtractor<'a> {
|
|||
};
|
||||
|
||||
// Skip whitespace until we find '('
|
||||
for (pos, c) in self.remaining.chars().enumerate() {
|
||||
if c == '(' {
|
||||
for (pos, c) in self.remaining.grapheme_indices(true) {
|
||||
if c == "(" {
|
||||
// Found left-paren.
|
||||
(_, self.remaining) = self.remaining.split_at(pos + 1);
|
||||
break;
|
||||
} else if char::is_whitespace(c) {
|
||||
} else if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -90,11 +91,11 @@ impl<'a> CssImageExtractor<'a> {
|
|||
// Find closing ')'
|
||||
let mut depth = 1;
|
||||
let mut url_parameter: Option<&str> = None;
|
||||
for (pos, c) in self.remaining.chars().enumerate() {
|
||||
if c == '(' {
|
||||
for (pos, c) in self.remaining.grapheme_indices(true) {
|
||||
if c == "(" {
|
||||
// Found nested left-paren.
|
||||
depth += 1;
|
||||
} else if c == ')' {
|
||||
} else if c == ")" {
|
||||
if depth > 1 {
|
||||
// Found nested right-paren.
|
||||
depth -= 1;
|
||||
|
@ -121,8 +122,8 @@ impl<'a> CssImageExtractor<'a> {
|
|||
// Strip optional whitespace and quotes from front and back.
|
||||
|
||||
// Trim off whitespace at beginning
|
||||
for (pos, c) in url_parameter.chars().enumerate() {
|
||||
if char::is_whitespace(c) {
|
||||
for (pos, c) in url_parameter.grapheme_indices(true) {
|
||||
if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace before url contents.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -132,8 +133,8 @@ impl<'a> CssImageExtractor<'a> {
|
|||
}
|
||||
|
||||
// Trim off whitespace at end
|
||||
for (pos, c) in url_parameter.chars().rev().enumerate() {
|
||||
if char::is_whitespace(c) {
|
||||
for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
|
||||
if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace after url contents.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -143,24 +144,24 @@ impl<'a> CssImageExtractor<'a> {
|
|||
}
|
||||
|
||||
// Trim off " at beginning.
|
||||
let c = url_parameter.chars().next();
|
||||
let c = url_parameter.graphemes(true).next();
|
||||
if let Some(c) = c {
|
||||
if c == '"' {
|
||||
if c == "\"" {
|
||||
(_, url_parameter) = url_parameter.split_at(1);
|
||||
}
|
||||
};
|
||||
|
||||
// Trim off " at end.
|
||||
let c = url_parameter.chars().rev().next();
|
||||
let c = url_parameter.graphemes(true).rev().next();
|
||||
if let Some(c) = c {
|
||||
if c == '"' {
|
||||
if c == "\"" {
|
||||
(url_parameter, _) = url_parameter.split_at(url_parameter.len() - 1);
|
||||
}
|
||||
};
|
||||
|
||||
// Trim off whitespace at beginning.
|
||||
for (pos, c) in url_parameter.chars().enumerate() {
|
||||
if char::is_whitespace(c) {
|
||||
for (pos, c) in url_parameter.grapheme_indices(true) {
|
||||
if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace before url contents.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -170,8 +171,8 @@ impl<'a> CssImageExtractor<'a> {
|
|||
}
|
||||
|
||||
// Trim off whitespace at end.
|
||||
for (pos, c) in url_parameter.chars().rev().enumerate() {
|
||||
if char::is_whitespace(c) {
|
||||
for (pos, c) in url_parameter.graphemes(true).rev().enumerate() {
|
||||
if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace after url contents.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -203,12 +204,12 @@ impl<'a> CssImageExtractor<'a> {
|
|||
};
|
||||
|
||||
// Skip whitespace until we find a 'b' (starting "base64")
|
||||
for (pos, c) in url_parameter.chars().enumerate() {
|
||||
if c == 'b' {
|
||||
for (pos, c) in url_parameter.grapheme_indices(true) {
|
||||
if c == "b" {
|
||||
// Found 'b'.
|
||||
(_, url_parameter) = url_parameter.split_at(pos + 1);
|
||||
break;
|
||||
} else if char::is_whitespace(c) {
|
||||
} else if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -227,12 +228,12 @@ impl<'a> CssImageExtractor<'a> {
|
|||
(_, url_parameter) = url_parameter.split_at("ase64".len());
|
||||
|
||||
// Skip whitespace until we find ','
|
||||
for (pos, c) in url_parameter.chars().enumerate() {
|
||||
if c == ',' {
|
||||
for (pos, c) in url_parameter.grapheme_indices(true) {
|
||||
if c == "," {
|
||||
// Found ','.
|
||||
(_, url_parameter) = url_parameter.split_at(pos + 1);
|
||||
break;
|
||||
} else if char::is_whitespace(c) {
|
||||
} else if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace.
|
||||
continue;
|
||||
} else {
|
||||
|
@ -242,8 +243,8 @@ impl<'a> CssImageExtractor<'a> {
|
|||
}
|
||||
|
||||
// Trim off whitespace at beginning.
|
||||
for (pos, c) in url_parameter.chars().enumerate() {
|
||||
if char::is_whitespace(c) {
|
||||
for (pos, c) in url_parameter.grapheme_indices(true) {
|
||||
if c.contains(char::is_whitespace) {
|
||||
// Skipping whitespace before url contents.
|
||||
continue;
|
||||
} else {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue