# --------------------------------------------------------------------------- # This software is provided as-is. # You may use it for commercial or personal use. # If you distribute it, please keep this notice intact. # # Original Copyright (c) 2002 Brad Choate # --------------------------------------------------------------------------- # Modifications and integration Copyright 2002-2003 Six Apart. # # $Id: Sanitize.pm,v 1.3 2003/02/16 08:17:08 btrott Exp $ package MT::Sanitize; use strict; use MT::ConfigMgr; sub parse_spec { my $class = shift; my($a) = @_; my(%ok_tags, %tag_attr); for my $rule (split /\s*,\s*/, $a) { my(%ok_attr, $tag, $style); $tag = lc $rule; if ($tag =~ m|^([^\s]+)\s+(.+)$|) { ($tag, my($attrs)) = ($1, $2); $style = $1 if $tag =~ s|(/)$||; %ok_attr = map { $_ => 1 } split /\s+/, $attrs; } else { $style = $1 if $tag =~ s|(/)$||; } $tag_attr{$tag} = $style if $style; $ok_tags{$tag} = keys %ok_attr ? \%ok_attr : 1; } { ok => \%ok_tags, tag_attr => \%tag_attr } } my %Strip = ( '%' => '%>', '?' => '?>', '!--' => '-->', ); my $Strip_RE = join '|', map quotemeta($_), keys %Strip; sub sanitize { my $class = shift; my($s, $arg) = @_; unless (!$arg || ref($arg)) { $arg = $class->parse_spec($arg); } my $ok_tags = $arg->{ok}; my $tag_attr = $arg->{tag_attr}; my(@open_tags, %open_tags); my $out = ''; my $start = 0; my $last_pos = 0; my $seq = 0; while ($s =~ m|<|gs) { my $start = (pos $s) - 1; my $inside = substr($s, $start + 1, 5); # 5 chars should do it... if ($last_pos < $start) { $out .= substr($s, $last_pos, $start - $last_pos); } if ($inside =~ m|^($Strip_RE)|) { my $tag = $1; pos $s = $start + length($tag) + 1; if ($s =~ m|\Q$Strip{$tag}\E|gs) { $last_pos = pos $s; } else { # can't find the closure. $last_pos = pos $s; last; } } elsif ($inside =~ m|^>|) { $last_pos = $start + 1; } elsif ($inside =~ m|^<|) { $last_pos = $start + 1; } elsif ($s =~ m|(.+?)>|gs) { $inside = $1; my $end = pos $s; my $name; my $closure = 0; if ($inside =~ m/^([^ ]+) (.+)$/s) { $name = lc($1); $inside = $2; } else { $name = lc($inside); $inside = ''; } if ($name =~ m|^/|) { $name =~ s|^/||; $closure = 1; } if ($inside =~ m|/$|s) { $inside =~ s|/$||s; $closure = 2; } if ($ok_tags->{$name} || (exists $tag_attr->{$name} && $tag_attr->{$name} eq '/')) { if ($inside) { my @attrs; while ($inside =~ m/(\w+)\s*=\s*(['"])(.*?)\2/gs) { my $att = lc($1); if ($ok_tags->{'*'}{$att} || (ref $ok_tags->{$name} && $ok_tags->{$name}{$att})) { push @attrs, "$1=$2$3$2"; } } if (@attrs) { $inside = join ' ', @attrs; } else { $inside = ''; } } if (exists $tag_attr->{$name} && ($tag_attr->{$name} eq '/')) { $closure = 2; } if ($closure != 1 || ($closure == 1 && $open_tags{$name})) { if ($closure == 1) { $out .= _expel_up_to(\@open_tags, \%open_tags, $name); } elsif (!$closure) { ###if (@open_tags && ### ($open_tags{$name}) && ### (exists $tag_attr{$name}) && ### ($tag_attr{$name} eq '!')) { ### # unnestable tag-- expel stuff ### $out .= _expel_up_to(\@open_tags, \%open_tags, $name); ### $out .= ''; ###} push @open_tags, $name; $open_tags{$name}++; } $out .= '<'.($closure==1?'/':''). $name. ($inside ? ' '.$inside : ''). ($closure==2?' /':'').'>'; if ($closure == 1) { $open_tags{$name}--; } } } $last_pos = $end; } else { $last_pos = pos $s; last; } } if (defined $last_pos && ($last_pos < length($s))) { $out .= substr($s, $last_pos); } for my $tag (@open_tags) { $out .= ''; } $out; } sub _expel_up_to { my ($open_tags, $open_hash, $stop_tag) = @_; my $out = ''; while (@$open_tags && (!defined $stop_tag || $open_tags->[$#$open_tags] ne $stop_tag)) { my $t = pop @$open_tags; $open_hash->{$t}--; $out .= ''; } if (@$open_tags) { my $t = pop @$open_tags; } $out; } 1; __END__ =head1 NAME MT::Sanitize - Sanitize HTML for Safety =head1 SYNOPSIS use MT::Sanitive; my $html = <foolink HTML my $str = MT::Sanitize->sanitize($html, 'a href'); ## $str is now foolink =cut