class Hermes::Parser

Parses an HTML file or string.

Constants

RE_ATTR
RE_BANG
RE_CDATA
RE_CMD
RE_COMMENT
RE_INSTR
RE_TAG
Tok

Attributes

list[R]

Public Class Methods

new(str, term = nil) click to toggle source
# File lib/hermes/tags.rb, line 42
# Scans +str+ for markup and collects the resulting tokens in @list.
#
# str::  the text to scan. When +term+ is given, +str+ is overwritten in
#        place (String#replace) with the text that was left over when
#        scanning stopped, so the caller can continue from there.
# term:: name of the enclosing tag whose closing tag ends this scan, or
#        nil for the outermost parser. When nil, the leftover text is
#        added as a data token instead of being handed back.
#
# Raises Error when a "<" is followed by text that matches neither the
# closing tag for +term+ nor any of the RE_* patterns tried below.
#
# NOTE(review): +term+ is interpolated into the closing-tag pattern
# without escaping; presumably tag names never contain regexp
# metacharacters -- confirm against RE_TAG.
def initialize str, term = nil
  @list = []
  s = str
  while s =~ /</ do
  # Text in front of the "<" becomes a data token (if not empty).
  add_data $`
    # Continue right behind the "<".
    s = $'
    # e becomes the token for this markup, or nil when the closing tag of
    # the enclosing element was reached. Every branch has to leave $' set
    # to the text that is still unscanned; it is picked up after the case.
    e = case s
      when %r{\A/\s*#{term}\s*>}im then
        nil
      when RE_TAG     then
        # s is handed to sub_parser; when that builds a nested parser, the
        # nested initialize replaces the contents of s with the text left
        # after the nested element.
        s = $'
        t =                Tok[ :tag,   $1.downcase, (attrs $2),
                                              (sub_parser s, $1, $3)]
        # Zero-width match at the start: makes $' equal to all of s, so
        # the shared "s = $'" below keeps s unchanged.
        s =~ %r{\A}
        t
      when RE_INSTR   then Tok[ :instr, $1.downcase, (attrs $2), nil]
      when RE_COMMENT then Tok[ :comm,  nil,         nil,        $1 ]
      # CDATA gets type nil, the same as plain text produced by add_data.
      when RE_CDATA   then Tok[ nil,    nil,         nil,        $1 ]
      when RE_BANG    then Tok[ :bang,  $1,          (attrl $2), nil]
      when RE_CMD     then Tok[ :cmd,   $1,          nil,        nil]
      else
        raise Error, "Unclosed standalone tag <#{term}>."
    end
    # Text following whatever pattern matched last.
    s = $'
    e or break
    add_tok e
  end
  if term then
    # Nested parser: give the unconsumed text back through the argument.
    str.replace s
  else
    add_data s
  end
end

Public Instance Methods

find_encoding() click to toggle source
# File lib/hermes/tags.rb, line 76
# Looks for a declared character encoding in the parsed token list.
# Delegates to find_enc with @list and returns its result.
def find_encoding
  find_enc(@list)
end
pretty_print() click to toggle source
# File lib/hermes/tags.rb, line 80
# Prints the token tree held in @list, starting at indentation level 0.
# Delegates to puts_tree and returns its result.
def pretty_print
  puts_tree(@list, 0)
end

Private Instance Methods

add_data(str) click to toggle source
# File lib/hermes/tags.rb, line 90
# Adds +str+ as an untyped (type nil) token via add_tok.
# Does nothing and returns nil when +str.notempty?+ is falsy.
def add_data str
  return unless str.notempty?

  add_tok Tok[nil, nil, nil, str]
end
add_tok(tok) click to toggle source
# File lib/hermes/tags.rb, line 96
# Stores +tok+ in @list.
#
# An untyped token (type nil/false) that directly follows another untyped
# token is not stored separately; its data is appended with << to the
# data of the previous token. Every other token is pushed onto @list.
def add_tok tok
  # Only an untyped token is a candidate for merging with its predecessor.
  prev = tok.type ? nil : @list.last
  if prev && !prev.type
    prev.data << tok.data
  else
    @list.push tok
  end
end
attr_val(str) click to toggle source
# File lib/hermes/tags.rb, line 129
# Cuts one attribute value off the front of +str+ and returns it.
#
# A value is either a double-quoted string, a single-quoted string (the
# quotes are not part of the result; the content may span lines), or a
# run of non-whitespace characters. +str+ is modified in place: the value
# and any whitespace around it are removed.
#
# Returns nil, leaving +str+ empty, when no value is left. Previously an
# empty string or one with leading whitespace matched none of the
# patterns, so $' was nil and String#replace raised a TypeError.
def attr_val str
  # Leading whitespace would keep every pattern below from matching.
  str.lstrip!
  m = /\A"(.*?)"/m.match(str) ||
      /\A'(.*?)'/m.match(str) ||
      /\A\S+/.match(str)
  return nil unless m

  # Quoted forms carry the value in group 1; the bare form has no group.
  r = m.captures.first || m[0]
  str.replace m.post_match
  str.lstrip!
  r
end
attrl(str) click to toggle source
# File lib/hermes/tags.rb, line 120
# Splits +str+ into a list of attribute values.
#
# Calls attr_val repeatedly, which removes one value from the front of
# +str+ each time, until +str.notempty?+ is falsy. Returns the values in
# order of appearance as an Array.
def attrl str
  values = []
  values << attr_val(str) while str.notempty?
  values
end
attrs(str) click to toggle source
# File lib/hermes/tags.rb, line 104
# Parses the attribute part of a tag into a Hash.
#
# Repeatedly removes one RE_ATTR match from the front of +str+ (which is
# modified in place) until +str.notempty?+ is falsy. Group 1 of RE_ATTR,
# downcased, is the key. When group 2 matched, the value is read with
# attr_val; otherwise the key itself serves as the value.
#
# Raises Error when the front of +str+ does not match RE_ATTR.
def attrs str
  result = {}
  while str.notempty?
    unless str.slice! RE_ATTR
      raise Error, "Illegal attribute specification: #{str}"
    end
    key       = Regexp.last_match(1).downcase
    has_value = Regexp.last_match(2)
    if has_value
      result[key] = attr_val str
    else
      # Attribute without a value: stored under its own name.
      str.lstrip!
      result[key] = key
    end
  end
  result
end
find_enc(p) click to toggle source
# File lib/hermes/tags.rb, line 140
# Searches the token list +p+ for a declared character encoding and
# returns the first one found, or nil.
#
# Sources, in document order:
# * <html> and <head> elements are searched recursively;
# * <meta charset="...">;
# * <meta http-equiv="Content-Type" content="...">, whose content value is
#   parsed with Contents.parse and asked for "charset";
# * the "encoding" attribute of a processing instruction.
#
# Changes from the previous version:
# * processing instructions are stored with type :instr by the parser, so
#   :instr is accepted here (:query is kept for compatibility);
# * an <html>/<head> token without data (self-closed) is skipped instead
#   of failing on nil;
# * the http-equiv value is compared case-insensitively.
def find_enc p
  p.each do |e|
    r = case e.type
      when :tag
        case e.tag
          when "html", "head"
            # data is nil when the tag was closed in place.
            find_enc e.data.list if e.data
          when "meta"
            e.attrs["charset"] || (
              if e.attrs["http-equiv"].to_s.downcase == "content-type"
                # Loaded lazily: only needed for this form of declaration.
                require "hermes/contents"
                c = Contents.parse e.attrs["content"]
                c["charset"]
              end
            )
        end
      when :instr, :query
        e.attrs && e.attrs["encoding"]
    end
    return r if r
  end
  nil
end
puts_tree(p, indent) click to toggle source
# File lib/hermes/tags.rb, line 164
# Writes the token list +p+ to standard output, one token per line,
# indented by two spaces per +indent+ level.
#
# Each line starts with the token type in brackets and the tag name.
# For :tag tokens that carry data, the nested list is printed one level
# deeper. For untyped tokens the inspected data follows on the same line.
def puts_tree p, indent
  pad = "  " * indent
  p.each do |e|
    print format("%s[%s] %s  ", pad, e.type, e.tag)
    if e.type == :tag
      puts
      puts_tree e.data.list, indent + 1 if e.data
    elsif e.type.nil?
      puts format("%s%s", "  " * (indent + 1), e.data.inspect)
    else
      puts
    end
  end
end
sub_parser(s, tag, close) click to toggle source
# File lib/hermes/tags.rb, line 86
# Builds the parser for the content of an element.
#
# s::     text following the opening tag; passed on to the new parser.
# tag::   tag name, passed on as the terminator of the new parser.
# close:: truthy when the tag needs no content parser; then nil is
#         returned and no parser is created.
def sub_parser s, tag, close
  return nil if close

  self.class.new s, tag
end