190 lines
5.0 KiB
Perl
190 lines
5.0 KiB
Perl
|
#!/usr/bin/perl
|
||
|
#
|
||
|
|
||
|
use strict;
|
||
|
use S2::FilePos;
|
||
|
use S2::TokenPunct;
|
||
|
use S2::TokenWhitespace;
|
||
|
use S2::TokenIdent;
|
||
|
use S2::TokenIntegerLiteral;
|
||
|
use S2::TokenPunct;
|
||
|
use S2::TokenComment;
|
||
|
use S2::TokenStringLiteral;
|
||
|
|
||
|
package S2::Tokenizer;
|
||
|
|
||
|
sub new # (fh) class method
|
||
|
{
|
||
|
my ($class, $content) = @_;
|
||
|
|
||
|
my $this = {};
|
||
|
bless $this, $class;
|
||
|
|
||
|
if (ref $content eq "SCALAR") {
|
||
|
$this->{'content'} = $content;
|
||
|
$this->{'length'} = length $$content;
|
||
|
}
|
||
|
$this->{'pos'} = 0;
|
||
|
$this->{'line'} = 1;
|
||
|
$this->{'col'} = 1;
|
||
|
$this->{'inString'} = 0; # (accessed directly elsewhere)
|
||
|
$this->{'inStringStack'} = [];
|
||
|
$this->{'peekedToken'} = undef;
|
||
|
return $this;
|
||
|
}
|
||
|
|
||
|
sub pushInString {
|
||
|
my ($this, $val) = @_;
|
||
|
push @{$this->{'inStringStack'}}, $this->{'inString'};
|
||
|
$this->{'inString'} = $val;
|
||
|
#print STDERR "PUSH: $val Stack: @{$this->{'inStringStack'}}\n";
|
||
|
}
|
||
|
|
||
|
sub popInString {
|
||
|
my ($this) = @_;
|
||
|
my $was = $this->{'inString'};
|
||
|
$this->{'inString'} = pop @{$this->{'inStringStack'}};
|
||
|
#print STDERR "POP: $this->{'inString'} Stack: @{$this->{'inStringStack'}}\n";
|
||
|
if ($was != $this->{'inString'} && $this->{'peekedToken'}) {
|
||
|
# back tokenizer up and discard our peeked token
|
||
|
pos(${$this->{'content'}}) = $this->{'peekedToken'}->{'pos_re'};
|
||
|
$this->{'peekedToken'} = undef;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub peek # () method : Token
|
||
|
{
|
||
|
$_[0]->{'peekedToken'} ||= $_[0]->getToken(1);
|
||
|
}
|
||
|
|
||
|
sub getToken # () method : Token
|
||
|
{
|
||
|
my ($this, $just_peek) = @_;
|
||
|
|
||
|
# return peeked token if we have one
|
||
|
if (my $t = $this->{'peekedToken'}) {
|
||
|
$this->{'peekedToken'} = undef;
|
||
|
$this->moveLineCol($t) unless $just_peek;
|
||
|
return $t;
|
||
|
}
|
||
|
|
||
|
my $pos = $this->getPos();
|
||
|
my $pos_re = pos(${$this->{'content'}});
|
||
|
my $nxtoken = $this->makeToken();
|
||
|
if ($nxtoken) {
|
||
|
$nxtoken->{'pos'} = $pos;
|
||
|
$nxtoken->{'pos_re'} = $pos_re;
|
||
|
$this->moveLineCol($nxtoken) unless $just_peek;
|
||
|
}
|
||
|
# print STDERR "Token: ", $nxtoken->toString, "\n";
|
||
|
return $nxtoken;
|
||
|
}
|
||
|
|
||
|
sub getPos # () method : FilePos
|
||
|
{
|
||
|
return new S2::FilePos($_[0]->{'line'},
|
||
|
$_[0]->{'col'});
|
||
|
}
|
||
|
|
||
|
sub moveLineCol {
|
||
|
my ($this, $t) = @_;
|
||
|
if (my $newlines = ($t->{'chars'} =~ tr/\n/\n/)) {
|
||
|
# print STDERR "Chars: $t [$t->{'chars'}] Lines: $newlines\n";
|
||
|
$this->{'line'} += $newlines;
|
||
|
$t->{'chars'} =~ /\n(.+)$/m;
|
||
|
$this->{'col'} = 1 + length $1;
|
||
|
} else {
|
||
|
# print STDERR "Chars: $t [$t->{'chars'}]\n";
|
||
|
$this->{'col'} += length $t->{'chars'};
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub makeToken # () method private : Token
|
||
|
{
|
||
|
my $this = shift;
|
||
|
my $c = $this->{'content'};
|
||
|
|
||
|
# finishing or trying to finish an open quoted string
|
||
|
if ($this->{'inString'} == 1 &&
|
||
|
$$c =~ /\G((\\[\\\"\$]|[^\"\$])*)(\")?/sgc) {
|
||
|
my $source = $1;
|
||
|
my $closed = $3 ? 1 : 0;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 0, $closed);
|
||
|
}
|
||
|
|
||
|
# finishing a triple quoted string
|
||
|
if ($this->{'inString'} == 3) {
|
||
|
if ($$c =~ /\G((\\[\\\"\$]|[^\$])*?)\"\"\"/sgc) {
|
||
|
my $source = $1;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 0, 3);
|
||
|
}
|
||
|
|
||
|
# not finishing a triple quoted string (end in $)
|
||
|
if ($$c =~ /\G((\\[\\\"\$]|[^\$])*)/sgc) {
|
||
|
my $source = $1;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 0, 0);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
# not in a string, but one's starting
|
||
|
if ($this->{'inString'} == 0 && $$c =~ /\G\"/gc) {
|
||
|
# triple start and triple end
|
||
|
if ($$c =~ /\G\"\"((\\[\\\"\$]|[^\$])*?)\"\"\"/gc) {
|
||
|
my $source = $1;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 3, 3);
|
||
|
}
|
||
|
|
||
|
# triple start and variable end
|
||
|
if ($$c =~ /\G\"\"((\\[\\\"\$]|[^\$])*)/gc) {
|
||
|
my $source = $1;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 3, 0);
|
||
|
}
|
||
|
|
||
|
# single start and maybe end
|
||
|
if ($$c =~ /\G((\\[\\\"\$]|[^\"\$])*)(\")?/gc) {
|
||
|
my $source = $1;
|
||
|
my $closed = $3 ? 1 : 0;
|
||
|
return S2::TokenStringLiteral->new(undef, $source, 1, $closed);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /\G\s+/gc) {
|
||
|
my $ws = $&;
|
||
|
return S2::TokenWhitespace->new($ws);
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /\G(<=?|>=?|==|=>?|!=|\+\+?|->|--?|;|::?|&&?|\|\|?|\*|\/|%|!|\.\.?|\{|\}|\[|\]|\(|\)|,|\?|\$)/gc) {
|
||
|
return S2::TokenPunct->new($1);
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /\G[a-zA-Z\_]\w*/gc) {
|
||
|
my $ident = $&;
|
||
|
return S2::TokenIdent->new($ident);
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /\G(\d+)/gc) {
|
||
|
my $iv = $1;
|
||
|
return S2::TokenIntegerLiteral->new($iv);
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /\G\#.*\n?/gc) {
|
||
|
return S2::TokenComment->new($&);
|
||
|
}
|
||
|
|
||
|
if ($$c =~ /.+/gc) {
|
||
|
S2::error($this->getPos(), "Parse error! Unknown token. ($&)");
|
||
|
}
|
||
|
|
||
|
return undef;
|
||
|
}
|
||
|
|
||
|
sub peekChar {
|
||
|
my $this = shift;
|
||
|
my $pos = pos(${$this->{'content'}});
|
||
|
my $ch = substr(${$this->{'content'}}, $pos, 1);
|
||
|
#print STDERR "pos = $pos, char = $ch\n";
|
||
|
return $ch;
|
||
|
}
|
||
|
|
||
|
1;
|