mirror of
https://github.com/Cisco-Talos/clamav.git
synced 2025-10-19 18:33:16 +00:00
294 lines
6.1 KiB
C
294 lines
6.1 KiB
C
![]() |
/*
|
||
|
* Copyright (C) 2004 Trog <trog@clamav.net>
|
||
|
*
|
||
|
* This program is free software; you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation; either version 2 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* This program is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with this program; if not, write to the Free Software
|
||
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||
|
*/
|
||
|
|
||
|
#if HAVE_CONFIG_H
|
||
|
#include "clamav-config.h"
|
||
|
#endif
|
||
|
|
||
|
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
|
||
|
|
||
|
#include "others.h"
|
||
|
|
||
|
#define FALSE (0)
|
||
|
#define TRUE (1)
|
||
|
|
||
|
/* Normalize an HTML buffer using the following rules:
|
||
|
o Remove multiple contiguous spaces
|
||
|
o Remove spaces around '<' and '>' in tags
|
||
|
o Remove spaces around '=' in tags
|
||
|
o Replace single quote with double quote in tags
|
||
|
o Convert to lowercase
|
||
|
o Convert all white space to a space character
|
||
|
*/
|
||
|
|
||
|
unsigned char *html_normalize(unsigned char *in_buff, off_t in_size)
|
||
|
{
|
||
|
unsigned char *out_buff;
|
||
|
off_t out_size=0, i;
|
||
|
int had_space=FALSE, tag_depth=0, in_quote=FALSE;
|
||
|
|
||
|
out_buff = (unsigned char *) cli_malloc(in_size+1);
|
||
|
if (!out_buff) {
|
||
|
cli_dbgmsg("html_normalize(): malloc failed\n");
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
for (i=0 ; i < in_size ; i++) {
|
||
|
if (in_buff[i] == '<') {
|
||
|
out_buff[out_size++] = '<';
|
||
|
tag_depth++;
|
||
|
if (tag_depth == 1) {
|
||
|
had_space=TRUE; /* consume spaces */
|
||
|
}
|
||
|
} else if ((in_buff[i] == '=') && (tag_depth == 1)) {
|
||
|
/* Remove preceeding spaces */
|
||
|
while ((out_size > 0) &&
|
||
|
(out_buff[out_size-1] == ' ')) {
|
||
|
out_size--;
|
||
|
}
|
||
|
out_buff[out_size++] = '=';
|
||
|
had_space=TRUE;
|
||
|
} else if (isspace(in_buff[i])) {
|
||
|
if (!had_space) {
|
||
|
out_buff[out_size++] = ' ';
|
||
|
had_space=TRUE;
|
||
|
}
|
||
|
} else if (in_buff[i] == '>') {
|
||
|
/* Remove preceeding spaces */
|
||
|
if (tag_depth == 1) {
|
||
|
while ((out_size > 0) &&
|
||
|
(out_buff[out_size-1] == ' ')) {
|
||
|
out_size--;
|
||
|
}
|
||
|
}
|
||
|
out_buff[out_size++] = '>';
|
||
|
tag_depth--;
|
||
|
} else if ((in_buff[i] == '\'') && (tag_depth==1)) {
|
||
|
/* Convert single quotes to double quotes */
|
||
|
if (in_quote || out_buff[out_size-1] == '=') {
|
||
|
out_buff[out_size++] = '\"';
|
||
|
in_quote = !in_quote;
|
||
|
} else {
|
||
|
out_buff[out_size++] = '\'';
|
||
|
}
|
||
|
} else {
|
||
|
out_buff[out_size++] = tolower(in_buff[i]);
|
||
|
had_space=FALSE;
|
||
|
}
|
||
|
}
|
||
|
out_buff[out_size] = '\0';
|
||
|
return out_buff;
|
||
|
}
|
||
|
|
||
|
/* Strip "<!...>" sequences from a NUL terminated string.
   Everything from a "<!" up to and including the next '>' is dropped;
   if no '>' follows, the rest of the input is dropped. All other
   characters are copied unchanged.
   Returns a newly allocated NUL terminated string, or NULL when line
   is NULL or the allocation fails. */
unsigned char *remove_html_comments(unsigned char *line)
{
    unsigned char *result, *dst;
    unsigned char *src;

    if (line == NULL) {
        return NULL;
    }

    result = (unsigned char *) cli_malloc(strlen(line) + 1);
    if (result == NULL) {
        return NULL;
    }

    src = line;
    dst = result;
    for (;;) {
        /* Copy literal text up to the next '<' (or end of input) */
        while ((*src != '\0') && (*src != '<')) {
            *dst++ = *src++;
        }
        if (*src == '\0') {
            break;
        }

        /* A '<' not followed by '!' (including a trailing '<') is
           ordinary text */
        if (src[1] != '!') {
            *dst++ = *src++;
            continue;
        }

        /* "<!" found: skip forward to the terminating '>' */
        src++;
        while ((*src != '\0') && (*src != '>')) {
            src++;
        }
        if (*src == '\0') {
            break;
        }
        src++;
    }

    *dst = '\0';
    return result;
}
|
||
|
|
||
|
/* Decode an HTML numeric character reference into its character value.

   cref  points at the text following the "&#" introducer, i.e. at an
         optional 'x'/'X' hex marker followed by the digits and an
         optional terminating ';'
   dest  receives the decoded value, truncated to an unsigned char

   Returns the number of characters of cref that make up the reference
   (hex marker, digits and ';'), or 0 when no reference could be
   decoded; *dest is left untouched in that case. At least two
   characters must be available at cref for a decode to be attempted.
*/
unsigned int decode_html_char_ref(unsigned char *cref,
                                  unsigned char *dest)
{
    unsigned int value = 0, count = 0, digits = 0;
    int is_hex = 0;

    if (!cref || !dest || !cref[0] || !cref[1]) {
        return 0;
    }

    /* The 'x' only counts as a hex marker when a hex digit follows */
    if (((*cref == 'x') || (*cref == 'X')) && isxdigit(cref[1])) {
        is_hex = 1;
        cref++;
        count++;
    }

    for (;;) {
        const unsigned char c = *cref;

        if (isdigit(c)) {
            value = (value * (is_hex ? 16u : 10u)) + (unsigned int) (c - '0');
        } else if (is_hex && isxdigit(c)) {
            /* Letters a-f / A-F carry the values 10..15 */
            value = (value * 16u) + (unsigned int) (tolower(c) - 'a' + 10);
        } else {
            break;
        }
        cref++;
        count++;
        digits++;
    }

    /* Without a single digit this is not a character reference; do not
       report a bare ';' as a decoded NUL character. */
    if (digits == 0) {
        return 0;
    }

    if (*cref == ';') {
        count++;
    }

    *dest = (unsigned char) value;

    return count;
}
|
||
|
|
||
|
/* Replace HTML numeric character references ("&#NNN;" / "&#xHH;") in a
   NUL terminated string with the characters they encode.

   Named entities ("&amp;" etc.) are not decoded; the '&' is copied
   through unchanged (TODO: character entities). An '&' with fewer than
   two characters after it is also copied through unchanged.

   Returns a newly allocated NUL terminated string, or NULL when line
   is NULL or the allocation fails. The output is never longer than
   the input because every decoded reference consumes at least three
   input characters and produces one.
   NOTE(review): a reference that decodes to 0 stores a NUL byte in the
   middle of the result - confirm callers tolerate that.
*/
unsigned char *remove_html_char_ref(unsigned char *line)
{
    unsigned char *newline, *newcurrent, *linepos;
    unsigned int count; /* same width as decode_html_char_ref()'s result */
    size_t literal_len;

    if (!line) {
        return NULL;
    }

    newcurrent = newline =
        (unsigned char *) cli_malloc(strlen((const char *) line) + 1);
    if (!newline) {
        return NULL;
    }

    for (;;) {
        linepos = (unsigned char *) strchr((const char *) line, '&');
        if (!linepos) {
            /* No more references: copy the tail including its NUL */
            strcpy((char *) newcurrent, (const char *) line);
            return newline;
        }

        /* Copy the literal text in front of the '&' */
        literal_len = (size_t) (linepos - line);
        memcpy(newcurrent, line, literal_len);
        newcurrent += literal_len;

        /* Only "&#" followed by at least one more character is handed
           to the decoder, which writes the character at newcurrent. */
        count = 0;
        if (linepos[1] && linepos[2] && (linepos[1] == '#')) {
            count = decode_html_char_ref(linepos + 2, newcurrent);
        }

        if (count > 0) {
            newcurrent++;
            line = linepos + count + 2; /* skip "&#" plus the reference */
        } else {
            /* Not a decodable reference: keep the '&' literally */
            *newcurrent++ = '&';
            line = linepos + 1;
        }
    }
}
|
||
|
|
||
|
/* Convert one hexadecimal digit character to its numeric value.
   Returns 0..15 for '0'-'9', 'A'-'F' and 'a'-'f', and -1 for any other
   character. */
int char2hex(unsigned char c)
{
    if ((c >= '0') && (c <= '9')) {
        return (c - '0');
    }
    if ((c >= 'A') && (c <= 'F')) {
        return (c - 'A' + 10);
    }
    if ((c >= 'a') && (c <= 'f')) {
        return (c - 'a' + 10);
    }
    return -1;
}
|
||
|
|
||
|
/* Decode a quoted-printable style buffer.

   line     input bytes; need not be NUL terminated
   in_size  number of bytes available in line

   Rules applied while copying:
	o "=XY" with two hex digits becomes the byte with value 0xXY
	o '=' followed by white space is a soft line break: the '=' and
	  all white space after it are dropped
	o any other '=' is dropped
	o every other byte is copied unchanged

   Returns a newly allocated, NUL terminated buffer that the caller
   takes ownership of, or NULL if the arguments are invalid or the
   allocation fails. No byte at or beyond line[in_size] is read.
*/
char *quoted_decode(unsigned char *line, off_t in_size)
{
    unsigned char *decoded;
    off_t in_pos = 0, out_pos = 0;

    if (!line || (in_size < 0)) {
        return NULL;
    }

    decoded = (unsigned char *) cli_malloc(in_size + 1);
    if (!decoded) {
        return NULL;
    }

    while (in_pos < in_size) {
        if (line[in_pos] != '=') {
            decoded[out_pos++] = line[in_pos++];
            continue;
        }

        /* Soft line break: '=' followed by white space */
        if (((in_pos + 1) < in_size) && isspace(line[in_pos + 1])) {
            in_pos++;
            while ((in_pos < in_size) && isspace(line[in_pos])) {
                in_pos++;
            }
            continue;
        }

        /* Encoded byte: both hex digits must lie inside the buffer */
        if (((in_pos + 2) < in_size) &&
            isxdigit(line[in_pos + 1]) && isxdigit(line[in_pos + 2])) {
            decoded[out_pos++] = (unsigned char)
                ((char2hex(line[in_pos + 1]) * 16) +
                 char2hex(line[in_pos + 2]));
            in_pos += 3;
            continue;
        }

        /* Malformed or truncated escape: skip the '=' only */
        in_pos++;
    }

    decoded[out_pos] = '\0';
    return (char *) decoded;
}
|