File: /usr/lib/ruby/site_ruby/1.8/puppet/parser/lexer.rb
# the scanner/lexer
require 'forwardable'
require 'strscan'
require 'puppet'
require 'puppet/util/methodhelper'
module Puppet
  class LexError < RuntimeError; end
end
module Puppet::Parser; end
class Puppet::Parser::Lexer
  extend Forwardable

  attr_reader :last, :file, :lexing_context, :token_queue
  attr_accessor :line, :indefine
  alias :indefine? :indefine

  # Returns the position on the line.
  # This implementation always returns nil. It is here for API reasons in Puppet::Error,
  # which needs to support both --parser current and --parser future.
  #
  def pos
    # Make the lexer comply with the newer API. It does not produce a pos...
    nil
  end

  def lex_error msg
    raise Puppet::LexError.new(msg)
  end
  class Token
    ALWAYS_ACCEPTABLE = Proc.new { |context| true }

    include Puppet::Util::MethodHelper

    attr_accessor :regex, :name, :string, :skip, :incr_line, :skip_text, :accumulate
    alias skip? skip
    alias accumulate? accumulate

    def initialize(string_or_regex, name, options = {})
      if string_or_regex.is_a?(String)
        @name, @string = name, string_or_regex
        @regex = Regexp.new(Regexp.escape(string_or_regex))
      else
        @name, @regex = name, string_or_regex
      end
      set_options(options)
      @acceptable_when = ALWAYS_ACCEPTABLE
    end

    def to_s
      string or @name.to_s
    end

    def acceptable?(context={})
      @acceptable_when.call(context)
    end

    # Define when the token is able to match.
    # This provides context that cannot be expressed otherwise, such as feature flags.
    #
    # @param block [Proc] a proc that given a context returns a boolean
    def acceptable_when(block)
      @acceptable_when = block
    end
  end
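
  # For illustration, a context-guarded token can be wired up like this
  # (a hypothetical :EXAMPLE token; the real definitions appear below):
  #
  #   t = Token.new('=>', :EXAMPLE)
  #   t.acceptable_when Proc.new { |ctx| ctx[:after] != :DQPRE }
  #   t.acceptable?(:after => :DQPRE)   # => false
  #   t.acceptable?(:after => :NAME)    # => true
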
  # Maintain a list of tokens.
  class TokenList
    extend Forwardable

    attr_reader :regex_tokens, :string_tokens
    def_delegator :@tokens, :[]

    # Create a new token.
    def add_token(name, regex, options = {}, &block)
      raise(ArgumentError, "Token #{name} already exists") if @tokens.include?(name)
      token = Token.new(regex, name, options)
      @tokens[token.name] = token
      if token.string
        @string_tokens << token
        @tokens_by_string[token.string] = token
      else
        @regex_tokens << token
      end

      token.meta_def(:convert, &block) if block_given?
      token
    end

    def initialize
      @tokens = {}
      @regex_tokens = []
      @string_tokens = []
      @tokens_by_string = {}
    end

    # Look up a token by its value, rather than its name.
    def lookup(string)
      @tokens_by_string[string]
    end

    # Define more tokens.
    def add_tokens(hash)
      hash.each do |regex, name|
        add_token(name, regex)
      end
    end

    # Sort our string tokens by length, longest first, so that once we match
    # we know we're done. This helps us avoid the O(n^2) nature of token matching.
    def sort_tokens
      @string_tokens.sort! { |a, b| b.string.length <=> a.string.length }
    end

    # Yield each token name and value in turn.
    def each
      @tokens.each {|name, value| yield name, value }
    end
  end
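
  # Usage sketch (this mirrors how TOKENS and KEYWORDS are built below):
  #
  #   list = TokenList.new
  #   list.add_token :FARROW, '=>'
  #   list.lookup('=>').name   # => :FARROW
  #   list[:FARROW].string     # => "=>"
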
  TOKENS = TokenList.new
  TOKENS.add_tokens(
    '['   => :LBRACK,
    ']'   => :RBRACK,
    '{'   => :LBRACE,
    '}'   => :RBRACE,
    '('   => :LPAREN,
    ')'   => :RPAREN,
    '='   => :EQUALS,
    '+='  => :APPENDS,
    '=='  => :ISEQUAL,
    '>='  => :GREATEREQUAL,
    '>'   => :GREATERTHAN,
    '<'   => :LESSTHAN,
    '<='  => :LESSEQUAL,
    '!='  => :NOTEQUAL,
    '!'   => :NOT,
    ','   => :COMMA,
    '.'   => :DOT,
    ':'   => :COLON,
    '@'   => :AT,
    '<<|' => :LLCOLLECT,
    '|>>' => :RRCOLLECT,
    '->'  => :IN_EDGE,
    '<-'  => :OUT_EDGE,
    '~>'  => :IN_EDGE_SUB,
    '<~'  => :OUT_EDGE_SUB,
    '<|'  => :LCOLLECT,
    '|>'  => :RCOLLECT,
    ';'   => :SEMIC,
    '?'   => :QMARK,
    '\\'  => :BACKSLASH,
    '=>'  => :FARROW,
    '+>'  => :PARROW,
    '+'   => :PLUS,
    '-'   => :MINUS,
    '/'   => :DIV,
    '*'   => :TIMES,
    '%'   => :MODULO,
    '<<'  => :LSHIFT,
    '>>'  => :RSHIFT,
    '=~'  => :MATCH,
    '!~'  => :NOMATCH,
    %r{((::){0,1}[A-Z][-\w]*)+} => :CLASSREF,
    "<string>" => :STRING,
    "<dqstring up to first interpolation>" => :DQPRE,
    "<dqstring between two interpolations>" => :DQMID,
    "<dqstring after final interpolation>" => :DQPOST,
    "<boolean>" => :BOOLEAN
  )
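
  # The bracketed names above ("<string>", "<boolean>", ...) are effectively
  # placeholders: find_string_token only peeks at most three characters ahead,
  # so these longer "strings" are never matched against source text. They
  # exist so that convert blocks and the string-slurping code can queue the
  # tokens by name, e.g.:
  #
  #   TOKENS.lookup('<|').name   # => :LCOLLECT
  #   TOKENS[:DQPRE].to_s        # => "<dqstring up to first interpolation>"
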
  module Contextual
    QUOTE_TOKENS = [:DQPRE,:DQMID]
    REGEX_INTRODUCING_TOKENS = [:NODE,:LBRACE,:RBRACE,:MATCH,:NOMATCH,:COMMA]

    NOT_INSIDE_QUOTES = Proc.new do |context|
      !QUOTE_TOKENS.include? context[:after]
    end

    INSIDE_QUOTES = Proc.new do |context|
      QUOTE_TOKENS.include? context[:after]
    end

    IN_REGEX_POSITION = Proc.new do |context|
      REGEX_INTRODUCING_TOKENS.include? context[:after]
    end

    IN_STRING_INTERPOLATION = Proc.new do |context|
      context[:string_interpolation_depth] > 0
    end

    DASHED_VARIABLES_ALLOWED = Proc.new do |context|
      Puppet[:allow_variables_with_dashes]
    end

    VARIABLE_AND_DASHES_ALLOWED = Proc.new do |context|
      Contextual::DASHED_VARIABLES_ALLOWED.call(context) and TOKENS[:VARIABLE].acceptable?(context)
    end
  end
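
  # These guards receive the lexing_context hash maintained by scan, e.g.:
  #
  #   Contextual::INSIDE_QUOTES.call(:after => :DQPRE)       # => true
  #   Contextual::IN_REGEX_POSITION.call(:after => :COMMA)   # => true
  #   Contextual::NOT_INSIDE_QUOTES.call(:after => :NAME)    # => true
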
  # Numbers are treated separately from names, so that they may contain dots.
  TOKENS.add_token :NUMBER, %r{\b(?:0[xX][0-9A-Fa-f]+|0?\d+(?:\.\d+)?(?:[eE]-?\d+)?)\b} do |lexer, value|
    [TOKENS[:NAME], value]
  end
  TOKENS[:NUMBER].acceptable_when Contextual::NOT_INSIDE_QUOTES
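
  # Note that the convert block above re-tags numbers as :NAME tokens, so
  # lexing "0x1f" or "1.5e3" yields [:NAME, "0x1f"] and [:NAME, "1.5e3"];
  # this lexer hands numbers to the parser as ordinary names.
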
  TOKENS.add_token :NAME, %r{((::)?[a-z0-9][-\w]*)(::[a-z0-9][-\w]*)*} do |lexer, value|
    string_token = self
    # We're looking for keywords here.
    if tmp = KEYWORDS.lookup(value)
      string_token = tmp
      if [:TRUE, :FALSE].include?(string_token.name)
        # value is the literal string "true" or "false" at this point;
        # eval simply turns it into the corresponding Ruby boolean.
        value = eval(value)
        string_token = TOKENS[:BOOLEAN]
      end
    end
    [string_token, value]
  end
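
  # So lexing "true" yields [:BOOLEAN, true] and "if" yields [:IF, "if"],
  # while an ordinary word such as "foo" stays [:NAME, "foo"].
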
  [:NAME, :CLASSREF].each do |name_token|
    TOKENS[name_token].acceptable_when Contextual::NOT_INSIDE_QUOTES
  end
  TOKENS.add_token :COMMENT, %r{#.*}, :accumulate => true, :skip => true do |lexer,value|
    value.sub!(/# ?/,'')
    [self, value]
  end

  TOKENS.add_token :MLCOMMENT, %r{/\*(.*?)\*/}m, :accumulate => true, :skip => true do |lexer, value|
    lexer.line += value.count("\n")
    value.sub!(/^\/\* ?/,'')
    value.sub!(/ ?\*\/$/,'')
    [self,value]
  end

  TOKENS.add_token :REGEX, %r{/[^/\n]*/} do |lexer, value|
    # Make sure we haven't matched an escaped /
    while value[-2..-2] == '\\'
      other = lexer.scan_until(%r{/})
      value += other
    end
    regex = value.sub(%r{\A/}, "").sub(%r{/\Z}, '').gsub("\\/", "/")
    [self, Regexp.new(regex)]
  end
  TOKENS[:REGEX].acceptable_when Contextual::IN_REGEX_POSITION
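
  # For example, the source text /foo\/bar/ first matches only up to the
  # escaped slash; the loop above then scans on to the real terminator,
  # ultimately producing Regexp.new("foo/bar").
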
  TOKENS.add_token :RETURN, "\n", :skip => true, :incr_line => true, :skip_text => true

  TOKENS.add_token :SQUOTE, "'" do |lexer, value|
    [TOKENS[:STRING], lexer.slurpstring(value,["'"],:ignore_invalid_escapes).first ]
  end

  DQ_initial_token_types      = {'$' => :DQPRE,'"' => :STRING}
  DQ_continuation_token_types = {'$' => :DQMID,'"' => :DQPOST}

  TOKENS.add_token :DQUOTE, /"/ do |lexer, value|
    lexer.tokenize_interpolated_string(DQ_initial_token_types)
  end

  TOKENS.add_token :DQCONT, /\}/ do |lexer, value|
    lexer.tokenize_interpolated_string(DQ_continuation_token_types)
  end
  TOKENS[:DQCONT].acceptable_when Contextual::IN_STRING_INTERPOLATION

  TOKENS.add_token :DOLLAR_VAR_WITH_DASH, %r{\$(?:::)?(?:[-\w]+::)*[-\w]+} do |lexer, value|
    lexer.warn_if_variable_has_hyphen(value)
    [TOKENS[:VARIABLE], value[1..-1]]
  end
  TOKENS[:DOLLAR_VAR_WITH_DASH].acceptable_when Contextual::DASHED_VARIABLES_ALLOWED

  TOKENS.add_token :DOLLAR_VAR, %r{\$(::)?(\w+::)*\w+} do |lexer, value|
    [TOKENS[:VARIABLE],value[1..-1]]
  end

  TOKENS.add_token :VARIABLE_WITH_DASH, %r{(?:::)?(?:[-\w]+::)*[-\w]+} do |lexer, value|
    lexer.warn_if_variable_has_hyphen(value)
    [TOKENS[:VARIABLE], value]
  end
  TOKENS[:VARIABLE_WITH_DASH].acceptable_when Contextual::VARIABLE_AND_DASHES_ALLOWED

  TOKENS.add_token :VARIABLE, %r{(::)?(\w+::)*\w+}
  TOKENS[:VARIABLE].acceptable_when Contextual::INSIDE_QUOTES

  TOKENS.sort_tokens
  @@pairs = {
    "{"   => "}",
    "("   => ")",
    "["   => "]",
    "<|"  => "|>",
    "<<|" => "|>>"
  }
  KEYWORDS = TokenList.new
  KEYWORDS.add_tokens(
    "case"     => :CASE,
    "class"    => :CLASS,
    "default"  => :DEFAULT,
    "define"   => :DEFINE,
    "import"   => :IMPORT,
    "if"       => :IF,
    "elsif"    => :ELSIF,
    "else"     => :ELSE,
    "inherits" => :INHERITS,
    "node"     => :NODE,
    "and"      => :AND,
    "or"       => :OR,
    "undef"    => :UNDEF,
    "false"    => :FALSE,
    "true"     => :TRUE,
    "in"       => :IN,
    "unless"   => :UNLESS
  )
  def clear
    initvars
  end

  def expected
    return nil if @expected.empty?
    name = @expected[-1]
    TOKENS.lookup(name) or lex_error "Could not find expected token #{name}"
  end

  # Scan the whole file; basically just used for testing.
  def fullscan
    array = []

    self.scan { |token, str|
      # Ignore any definition nesting problems
      @indefine = false
      array.push([token,str])
    }
    array
  end

  def file=(file)
    @file = file
    @line = 1
    contents = Puppet::FileSystem.exist?(file) ? Puppet::FileSystem.read(file) : ""
    @scanner = StringScanner.new(contents)
  end
  def_delegator :@token_queue, :shift, :shift_token

  def find_string_token
    # We know our longest string token is three characters, so try each size
    # in turn until we either match or run out of characters. This way our
    # worst case is three tries, where otherwise it would be the number of
    # string tokens we have. Also, the lookups are optimized hash lookups,
    # instead of regex scans.
    #
    s = @scanner.peek(3)
    token = TOKENS.lookup(s[0,3]) || TOKENS.lookup(s[0,2]) || TOKENS.lookup(s[0,1])
    [ token, token && @scanner.scan(token.regex) ]
  end
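
  # For example, with "<<|" next in the input, the lookup above tries "<<|"
  # (:LLCOLLECT) before "<<" (:LSHIFT) or "<" (:LESSTHAN), so the longest
  # operator always wins.
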
  # Find the next token that matches a regex. We look for these first.
  def find_regex_token
    best_token = nil
    best_length = 0

    # I tried optimizing based on the first char, but it had a slightly
    # negative effect and was a good bit more complicated.
    TOKENS.regex_tokens.each do |token|
      if length = @scanner.match?(token.regex) and token.acceptable?(lexing_context)
        # We've found a longer match
        if length > best_length
          best_length = length
          best_token = token
        end
      end
    end

    return best_token, @scanner.scan(best_token.regex) if best_token
  end

  # Find the next token, returning the string and the token.
  def find_token
    shift_token || find_regex_token || find_string_token
  end
  def initialize
    initvars
  end

  def initvars
    @line = 1
    @previous_token = nil
    @scanner = nil
    @file = nil

    # We cannot use \s for skipping here: Ruby's \s matches "\n", and
    # newlines are significant to this lexer (they become :RETURN tokens).
    @skip = %r{[ \t\r]+}

    @namestack = []
    @token_queue = []
    @indefine = false
    @expected = []
    @commentstack = [ ['', @line] ]
    @lexing_context = {
      :after => nil,
      :start_of_line => true,
      :string_interpolation_depth => 0
    }
  end
  # Make any necessary changes to the token and/or value.
  def munge_token(token, value)
    @line += 1 if token.incr_line

    skip if token.skip_text

    return if token.skip and not token.accumulate?

    token, value = token.convert(self, value) if token.respond_to?(:convert)

    return unless token

    if token.accumulate?
      comment = @commentstack.pop
      comment[0] << value + "\n"
      @commentstack.push(comment)
    end

    return if token.skip

    return token, { :value => value, :line => @line }
  end
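
  # For example, a matched :COMMENT token is accumulated onto the comment
  # stack and then dropped (munge_token returns nil), while a :NAME token
  # comes back as [token, { :value => "...", :line => n }].
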
  # Handling the namespace stack
  def_delegator :@namestack, :pop, :namepop

  # This value might have :: in it, but we don't care -- it'll be handled
  # normally when joining, and when popping we want to pop this full value,
  # however long the namespace is.
  def_delegator :@namestack, :<<, :namestack

  # Collect the current namespace.
  def namespace
    @namestack.join("::")
  end

  def_delegator :@scanner, :rest
  # This is the heart of the lexer.
  def scan
    #Puppet.debug("entering scan")
    lex_error "Invalid or empty string" unless @scanner

    # Skip any initial whitespace.
    skip

    until token_queue.empty? and @scanner.eos? do
      matched_token, value = find_token

      # Error out if we didn't match anything at all.
      lex_error "Could not match #{@scanner.rest[/^(\S+|\s+|.*)/]}" unless matched_token

      newline = matched_token.name == :RETURN

      # This matches a blank line; eat the previously accumulated comments.
      getcomment if lexing_context[:start_of_line] and newline
      lexing_context[:start_of_line] = newline

      final_token, token_value = munge_token(matched_token, value)
      unless final_token
        skip
        next
      end

      final_token_name = final_token.name
      lexing_context[:after] = final_token_name unless newline
      lexing_context[:string_interpolation_depth] += 1 if final_token_name == :DQPRE
      lexing_context[:string_interpolation_depth] -= 1 if final_token_name == :DQPOST

      value = token_value[:value]

      if match = @@pairs[value] and final_token_name != :DQUOTE and final_token_name != :SQUOTE
        @expected << match
      elsif exp = @expected[-1] and exp == value and final_token_name != :DQUOTE and final_token_name != :SQUOTE
        @expected.pop
      end

      if final_token_name == :LBRACE or final_token_name == :LPAREN
        commentpush
      end
      if final_token_name == :RPAREN
        commentpop
      end

      yield [final_token_name, token_value]

      if @previous_token
        namestack(value) if @previous_token.name == :CLASS and value != '{'

        if @previous_token.name == :DEFINE
          if indefine?
            msg = "Cannot nest definition #{value} inside #{@indefine}"
            self.indefine = false
            raise Puppet::ParseError, msg
          end

          @indefine = value
        end
      end
      @previous_token = final_token
      skip
    end
    @scanner = nil

    # This indicates that we're done parsing.
    yield [false,false]
  end
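
  # A minimal driving loop (this is what fullscan does):
  #
  #   lexer = Puppet::Parser::Lexer.new
  #   lexer.string = "class foo { }"
  #   lexer.fullscan
  #   # => [[:CLASS,  {:value => "class", :line => 1}],
  #   #     [:NAME,   {:value => "foo",   :line => 1}],
  #   #     [:LBRACE, {:value => "{",     :line => 1}],
  #   #     [:RBRACE, {:value => "}",     :line => 1}],
  #   #     [false, false]]
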
  # Skip any skipchars in our remaining string.
  def skip
    @scanner.skip(@skip)
  end

  # Provide some limited access to the scanner, for those tokens that need it.
  def_delegator :@scanner, :scan_until
  # We've encountered the start of a string; slurp in the rest of the
  # string and return it.
  def slurpstring(terminators,escapes=%w{ \\ $ ' " r n t s }+["\n"],ignore_invalid_escapes=false)
    # We search for the next terminator that is preceded by an even number
    # of backslashes (i.e. one that isn't escaped); the caret is there to
    # match empty strings.
    str = @scanner.scan_until(/([^\\]|^|[^\\])([\\]{2})*[#{terminators}]/) or lex_error "Unclosed quote after '#{last}' in '#{rest}'"
    @line += str.count("\n") # literal newlines add to the line count.
    str.gsub!(/\\(.)/m) {
      ch = $1
      if escapes.include? ch
        case ch
        when 'r'; "\r"
        when 'n'; "\n"
        when 't'; "\t"
        when 's'; " "
        when "\n"; ''
        else ch
        end
      else
        Puppet.warning "Unrecognised escape sequence '\\#{ch}'#{file && " in file #{file}"}#{line && " at line #{line}"}" unless ignore_invalid_escapes
        "\\#{ch}"
      end
    }
    [ str[0..-2],str[-1,1] ]
  end
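
  # For example, positioned just after the opening quote of  "hello\nworld" ,
  # slurpstring('"$') scans up to the closing quote and returns
  # ["hello\nworld", '"'] -- the body with its escape expanded to a real
  # newline, plus the terminator that ended the scan.
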
  def tokenize_interpolated_string(token_type,preamble='')
    value,terminator = slurpstring('"$')
    token_queue << [TOKENS[token_type[terminator]],preamble+value]
    variable_regex = if Puppet[:allow_variables_with_dashes]
      TOKENS[:VARIABLE_WITH_DASH].regex
    else
      TOKENS[:VARIABLE].regex
    end
    if terminator != '$' or @scanner.scan(/\{/)
      # Either the string is finished, or a ${} interpolation follows:
      # return the first queued token to the convert block; any remaining
      # queued tokens are drained later via shift_token.
      token_queue.shift
    elsif var_name = @scanner.scan(variable_regex)
      warn_if_variable_has_hyphen(var_name)
      token_queue << [TOKENS[:VARIABLE],var_name]
      tokenize_interpolated_string(DQ_continuation_token_types)
    else
      tokenize_interpolated_string(token_type,token_queue.pop.last + terminator)
    end
  end
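
  # For example, lexing the manifest text  "a$x"  emits, in order:
  #
  #   [:DQPRE, "a"], [:VARIABLE, "x"], [:DQPOST, ""]
  #
  # and "${...}" interpolation works the same way, except that the expression
  # between the braces is lexed as ordinary tokens until :DQCONT sees the
  # closing brace.
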
  # Just parse a string, not a whole file.
  def string=(string)
    @scanner = StringScanner.new(string)
  end

  # Returns the content of the currently accumulated comment cache,
  # popping it off the stack.
  def commentpop
    @commentstack.pop[0]
  end

  def getcomment(line = nil)
    comment = @commentstack.last
    if line.nil? or comment[1] <= line
      @commentstack.pop
      @commentstack.push(['', @line])
      return comment[0]
    end
    ''
  end

  def commentpush
    @commentstack.push(['', @line])
  end

  def warn_if_variable_has_hyphen(var_name)
    if var_name.include?('-')
      Puppet.deprecation_warning("Using `-` in variable names is deprecated at #{file || '<string>'}:#{line}. See http://links.puppetlabs.com/puppet-hyphenated-variable-deprecation")
    end
  end
end