Más rapidito

Posted: July 1st, 2009 | Author: FreedomCoder | Filed under: Open Source, Programming, how-to | Tags: , , , | No Comments »

Como les estuve contando, sigo escribiendo mi wiki. Ya parsea un subconjunto interesante del lenguaje definido por trac.
Siguiendo la tradición, les cuento cómo está avanzando el tokenizer. Al tokenizer lo simplifiqué para que devuelva la expresión regular que matcheó junto con el match (en vez del “tipo”). Esto hizo que la interfaz para definir las reglas para tokenizar sea más simple. Si no hay ninguna regla que matchee, sigue devolviendo ["string", :text].
Sin más, acá el código:

module Rapidito
  # Splits a source string into delimiter tokens and plain-text tokens.
  #
  # Delimiters are regular expressions. Every token is a two-element array:
  #
  # * [MatchData, Regexp] for a delimiter: the match and the regexp that
  #   produced it;
  # * [String, :text] for the text that precedes the next delimiter.
  #
  # An implicit /\z/ delimiter is appended to the user-supplied ones, so a
  # match always exists and trailing text is emitted as a :text token.
  #
  # NOTE: a delimiter that matches the empty string at the very start of the
  # remaining source consumes nothing, so #all_tokens would not terminate
  # with such a delimiter.
  class Tokenizer

    # delimiters:: zero or more Regexp objects that mark delimiter tokens.
    def initialize( *delimiters )
      @delimiter_list = delimiters + [/\z/]
      @match_cache = nil
      # Start with an empty source so that querying the tokenizer before
      # #source= is called reports "no tokens" instead of failing on nil.
      @source = ""
    end

    # Returns the text that has not been handed out yet. A delimiter that was
    # already matched but is still waiting in the cache is put back in front,
    # which keeps the caching invisible to the caller.
    def source
      return @source unless valid_cache?

      @match_cache[0].to_s + @source
    end

    # Replaces the text to tokenize and discards any pending delimiter.
    def source=(s)
      @match_cache = nil
      @source = s
    end

    # True while there is unconsumed text or a pending, non-empty delimiter.
    def has_next?
      !@source.empty? || valid_cache?
    end

    # True when a delimiter is cached and it is not the empty end-of-input
    # match (which must not be reported as a remaining token).
    def valid_cache?
      !@match_cache.nil? && !@match_cache[0].to_s.empty?
    end

    # Returns the [MatchData, Regexp] pair of the best delimiter match in the
    # remaining source: the earliest one wins; among matches starting at the
    # same position the longest wins; on a full tie the delimiter listed last
    # wins.
    def next_match
      candidates = @delimiter_list.map { |regex| [regex.match(@source), regex] }
      candidates = candidates.reject { |match, _regex| match.nil? }

      candidates.inject do |best, candidate|
        best_pos = best[0].begin(0)
        candidate_pos = candidate[0].begin(0)

        if best_pos != candidate_pos
          best_pos < candidate_pos ? best : candidate
        elsif best[0].to_s.length > candidate[0].to_s.length
          best
        else
          candidate
        end
      end
    end

    # Returns the next token and advances past it.
    #
    # When text precedes the next delimiter, the text is returned as
    # [String, :text] and the delimiter is cached for the following call.
    # Once the input is exhausted, the empty /\z/ match is returned.
    def next_token
      if @match_cache # delimiter found while producing the previous text token
        pending = @match_cache
        @match_cache = nil
        return pending
      end

      found = next_match
      match_data = found[0]
      offset = match_data.begin(0)
      # Everything up to and including the delimiter is consumed now.
      @source = match_data.post_match

      return found if offset.zero? # delimiter right at the start

      @match_cache = found
      [match_data.pre_match, :text]
    end

    # Consumes the remaining source and returns every token in order.
    def all_tokens
      tokens = []
      tokens << next_token while has_next?
      tokens
    end
  end
end

Y si miran los tests de unidad, van a ver que también quedaron más lindos:

require 'test/unit'
require 'rapidito/tokenizer'

include Rapidito

# Unit tests for Rapidito::Tokenizer. Delimiter tokens are [MatchData, Regexp]
# pairs and text tokens are [String, :text] pairs.
class TokenizerTest < Test::Unit::TestCase

  # Without delimiters the whole source comes back as a single text token.
  def test_no_token
    tok = Tokenizer.new
    tok.source = "aaaa"
    assert_equal true, tok.has_next?
    assert_equal ["aaaa", :text], tok.next_token
    assert_equal false, tok.has_next?
  end

  # Helper: compares all remaining tokens against +expected+, converting each
  # token's first element to a String so MatchData can be written as text.
  def assert_all_tokens( expected, tokenizer )
    assert_equal expected,
      tokenizer.all_tokens.map { |token, kind| [token.to_s, kind] }
  end

  # Each delimiter is reported with the regexp that matched it, including
  # when the same delimiter appears twice in a row.
  def test_two_delimiters
    tok = Tokenizer.new(
      /\|/, /;;/
    )

    tok.source = "aa|bbb;;;;cccc"
    assert_all_tokens \
      [ ["aa", :text], ["|", /\|/], ["bbb", :text],
        [";;", /;;/], [";;", /;;/], ["cccc", :text] ],
      tok

    tok.source = "aa;;bbb||cccc"
    assert_all_tokens \
      [ ["aa", :text], [";;", /;;/], ["bbb", :text],
        ["|", /\|/], ["|", /\|/], ["cccc", :text] ],
      tok
  end

  # When two delimiters match at the same position, the longest match wins.
  def test_choose_longest_match
    tok = Tokenizer.new(
      /aa/, /aaa/
    )
    tok.source = "aaaa"
    assert_all_tokens [ ["aaa", /aaa/], ["a", :text ] ], tok
  end

  # Assigning a new source must drop the delimiter cached from the old one.
  def test_reset_precache
    tok = Tokenizer.new(
      /\|/, /,/
    )
    tok.source = "original start|original end"
    tok.next_token
    tok.source = "new start,new end"
    assert_equal ["new start", :text], tok.next_token
  end

  # A delimiter still waiting in the cache counts as a remaining token.
  def test_almost_finished
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal false, tok.has_next?
  end

  # A trailing newline after the last delimiter is emitted as its own token.
  def test_carriage_return_ending
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!\n"
    tok.next_token
    assert_equal true, tok.has_next?
    tok.next_token
    assert_equal true, tok.has_next?
    assert_equal "\n", tok.next_token[0].to_s
    assert_equal false, tok.has_next?
  end

  # #source must still show the delimiter that is only cached, not yet
  # returned.
  def test_transparent_caching
    tok = Tokenizer.new( /!/ )
    tok.source = "bang!pum"
    tok.next_token

    assert_equal "!pum", tok.source
  end

  # Delimiter tokens carry a MatchData, text tokens carry a String.
  def test_match_klass
    tok = Tokenizer.new( /!/ )
    tok.source = "!bang!pum"

    # The block parameter is deliberately not named +tok+: reusing the name
    # would shadow (or, on Ruby 1.8, overwrite) the tokenizer local above.
    assert_equal \
      [MatchData, String, MatchData, String],
      tok.all_tokens.map { |token, _kind| token.class }
  end
end

Happy hacking,
Aureliano.

(Via aurelianito.) Original Link: Más rapidito



Leave a Reply

  • Powered by WP Hashcash