ljr/wcmtools/s2/S2/Tokenizer.pm

190 lines
5.0 KiB
Perl
Executable File

#!/usr/bin/perl
#
use strict;
use S2::FilePos;
use S2::TokenPunct;
use S2::TokenWhitespace;
use S2::TokenIdent;
use S2::TokenIntegerLiteral;
use S2::TokenPunct;
use S2::TokenComment;
use S2::TokenStringLiteral;
package S2::Tokenizer;
sub new # (fh) class method
{
my ($class, $content) = @_;
my $this = {};
bless $this, $class;
if (ref $content eq "SCALAR") {
$this->{'content'} = $content;
$this->{'length'} = length $$content;
}
$this->{'pos'} = 0;
$this->{'line'} = 1;
$this->{'col'} = 1;
$this->{'inString'} = 0; # (accessed directly elsewhere)
$this->{'inStringStack'} = [];
$this->{'peekedToken'} = undef;
return $this;
}
sub pushInString {
my ($this, $val) = @_;
push @{$this->{'inStringStack'}}, $this->{'inString'};
$this->{'inString'} = $val;
#print STDERR "PUSH: $val Stack: @{$this->{'inStringStack'}}\n";
}
sub popInString {
my ($this) = @_;
my $was = $this->{'inString'};
$this->{'inString'} = pop @{$this->{'inStringStack'}};
#print STDERR "POP: $this->{'inString'} Stack: @{$this->{'inStringStack'}}\n";
if ($was != $this->{'inString'} && $this->{'peekedToken'}) {
# back tokenizer up and discard our peeked token
pos(${$this->{'content'}}) = $this->{'peekedToken'}->{'pos_re'};
$this->{'peekedToken'} = undef;
}
}
sub peek # () method : Token
{
$_[0]->{'peekedToken'} ||= $_[0]->getToken(1);
}
sub getToken # () method : Token
{
my ($this, $just_peek) = @_;
# return peeked token if we have one
if (my $t = $this->{'peekedToken'}) {
$this->{'peekedToken'} = undef;
$this->moveLineCol($t) unless $just_peek;
return $t;
}
my $pos = $this->getPos();
my $pos_re = pos(${$this->{'content'}});
my $nxtoken = $this->makeToken();
if ($nxtoken) {
$nxtoken->{'pos'} = $pos;
$nxtoken->{'pos_re'} = $pos_re;
$this->moveLineCol($nxtoken) unless $just_peek;
}
# print STDERR "Token: ", $nxtoken->toString, "\n";
return $nxtoken;
}
sub getPos # () method : FilePos
{
return new S2::FilePos($_[0]->{'line'},
$_[0]->{'col'});
}
sub moveLineCol {
my ($this, $t) = @_;
if (my $newlines = ($t->{'chars'} =~ tr/\n/\n/)) {
# print STDERR "Chars: $t [$t->{'chars'}] Lines: $newlines\n";
$this->{'line'} += $newlines;
$t->{'chars'} =~ /\n(.+)$/m;
$this->{'col'} = 1 + length $1;
} else {
# print STDERR "Chars: $t [$t->{'chars'}]\n";
$this->{'col'} += length $t->{'chars'};
}
}
sub makeToken # () method private : Token
{
my $this = shift;
my $c = $this->{'content'};
# finishing or trying to finish an open quoted string
if ($this->{'inString'} == 1 &&
$$c =~ /\G((\\[\\\"\$]|[^\"\$])*)(\")?/sgc) {
my $source = $1;
my $closed = $3 ? 1 : 0;
return S2::TokenStringLiteral->new(undef, $source, 0, $closed);
}
# finishing a triple quoted string
if ($this->{'inString'} == 3) {
if ($$c =~ /\G((\\[\\\"\$]|[^\$])*?)\"\"\"/sgc) {
my $source = $1;
return S2::TokenStringLiteral->new(undef, $source, 0, 3);
}
# not finishing a triple quoted string (end in $)
if ($$c =~ /\G((\\[\\\"\$]|[^\$])*)/sgc) {
my $source = $1;
return S2::TokenStringLiteral->new(undef, $source, 0, 0);
}
}
# not in a string, but one's starting
if ($this->{'inString'} == 0 && $$c =~ /\G\"/gc) {
# triple start and triple end
if ($$c =~ /\G\"\"((\\[\\\"\$]|[^\$])*?)\"\"\"/gc) {
my $source = $1;
return S2::TokenStringLiteral->new(undef, $source, 3, 3);
}
# triple start and variable end
if ($$c =~ /\G\"\"((\\[\\\"\$]|[^\$])*)/gc) {
my $source = $1;
return S2::TokenStringLiteral->new(undef, $source, 3, 0);
}
# single start and maybe end
if ($$c =~ /\G((\\[\\\"\$]|[^\"\$])*)(\")?/gc) {
my $source = $1;
my $closed = $3 ? 1 : 0;
return S2::TokenStringLiteral->new(undef, $source, 1, $closed);
}
}
if ($$c =~ /\G\s+/gc) {
my $ws = $&;
return S2::TokenWhitespace->new($ws);
}
if ($$c =~ /\G(<=?|>=?|==|=>?|!=|\+\+?|->|--?|;|::?|&&?|\|\|?|\*|\/|%|!|\.\.?|\{|\}|\[|\]|\(|\)|,|\?|\$)/gc) {
return S2::TokenPunct->new($1);
}
if ($$c =~ /\G[a-zA-Z\_]\w*/gc) {
my $ident = $&;
return S2::TokenIdent->new($ident);
}
if ($$c =~ /\G(\d+)/gc) {
my $iv = $1;
return S2::TokenIntegerLiteral->new($iv);
}
if ($$c =~ /\G\#.*\n?/gc) {
return S2::TokenComment->new($&);
}
if ($$c =~ /.+/gc) {
S2::error($this->getPos(), "Parse error! Unknown token. ($&)");
}
return undef;
}
sub peekChar {
my $this = shift;
my $pos = pos(${$this->{'content'}});
my $ch = substr(${$this->{'content'}}, $pos, 1);
#print STDERR "pos = $pos, char = $ch\n";
return $ch;
}
1;