Skip to content

Commit

Permalink
Initial commit for parser
Browse files Browse the repository at this point in the history
  • Loading branch information
maximecb committed Feb 7, 2024
1 parent 7acb99b commit 98e3c6c
Showing 1 changed file with 283 additions and 0 deletions.
283 changes: 283 additions & 0 deletions lib/protobuff/parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
# Parser for the protobuf (proto3) language:
# https://protobuf.dev/programming-guides/proto3/

# Note: Google has a unittest.proto test file
# We should aim to be able to parse it
# https://github.com/protocolbuffers/protoscope/blob/main/testdata/unittest.proto

# Notes about protobuf language (these will be removed later):
# - There are // style single-line comments
# - Messages have a list of fields
# - Fields have field numbers
# - Fields can be optional, repeated, map (key/value pairs)
# - Fields can have a default value
# - There are enums, which are integer types
# - Enum variants have integer values (is this optional?)
# - There must be a zero value (the default value)
# - Multiple different enums can share the same value if you set "option allow_alias = true;"
# - Enumerator constants must be in the range of a 32-bit integer.
# enum Foo {
# reserved 2, 15, 9 to 11, 40 to max;
# reserved "FOO", "BAR";
# }
# - There's also oneof, which is a kind of union/enum
# - You can import definitions
# import "myproject/other_protos.proto";

# Represents an input string/file
# Works as a tokenizer
class Input
def initialize(src)
@src = src
@cur_idx = 0
end

# Check if we're at the end of the input
def eof?
return @cur_idx >= @src.size
end

# Check if the input start with a given string
def start_with?(str)
return @src[@cur_idx...(@cur_idx + str.size)] == str
end

# Check if the input matches a given string/keyword
# If there is a match, consume the string
def match(str)
eat_ws
if start_with?(str)
@cur_idx += str.size
true
else
false
end
end

# Raise an exception if we can't match a specific string
def expect(str)
if !match(str)
raise "expected \"#{str}\""
end
end

# Peek at the next input character
def peek_ch()
@src[@cur_idx]
end

# Consume one character from the input
def eat_ch()
ch = @src[@cur_idx]
@cur_idx += 1
ch
end

# Consume whitespace
def eat_ws()
loop do
if eof?
break
end

ch = peek_ch

if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
eat_ch
else
break
end
end
end

# Read an identifier/name, eg:
# foo
# bar_bif
# foo123
def read_ident
name = ''

loop do
if eof?
break
end

ch = peek_ch

if !ch.match(/[A-Za-z0-9_]/)
break
end

name << eat_ch
end

if name.size == 0
raise "expected variable name"
end

return name
end

# Read a string constant, eg:
# 'foobar'
def read_string
str = ''

# The string must start with an opening quote
expect '"'

loop do
if eof?
raise "unexpected end of input"
end

# End of string
if match("\"")
break
end

# TODO: more complete support for string escaping
if match("\\'")
str << "\'"
continue
end

str << eat_ch
end

return str
end

# Read an integer constant
def read_int
value = 0

loop do
if eof?
raise "unexpected end of input"
end

ch = peek_ch

if ch < '0' || ch > '9'
break
end

# Decimal digit value
digit = ch.getbyte(0) - 0x30
value = 10 * value + digit
eat_ch
end

value
end
end

Unit = Struct.new(:messages)

Message = Struct.new(:name, :fields)

# TODO: repeated fields
Field = Struct.new(:optional, :name, :number)

# Parse an entire source unit (e.g. input file)
def parse_unit(input)
messages = []

loop do
input.eat_ws

if input.eof?
break
end

ident = input.read_ident

# Syntax mode
if ident == "syntax"
input.eat_ws
mode = input.read_string
input.expect ';'
if mode != "proto3"
raise "syntax mode must be proto3"
end
end

# Message definition
if ident == "message"
parse_message(input)
end
end

Unit.new(messages)
end

def parse_string(str)
parse_unit(Input.new(str))
end

def parse_file(name)
raise "TODO"
end

# Parse a message definition
def parse_message(input)
fields = []

input.eat_ws
name = input.read_ident
input.expect '{'

loop do
if input.match '}'
break
end

optional = false
if input.match 'optional'
optional = true
end

# Field type and name
input.eat_ws
type = input.read_ident
input.eat_ws
name = input.read_ident
input.expect '='
input.eat_ws
number = input.read_int
input.expect ';'

if number < 0 || number > 0xFF_FF_FF_FF
raise "field number should be in uint32 range"
end

fields << Field.new(optional, name, number)
end

# TODO: determine if an empty message with no fields is valid protobuf

Message.new(name, fields)
end

parse_string('')
parse_string('syntax "proto3";')
parse_string('syntax "proto3"; message Foo {}')
parse_string('message Test1 { optional int32 a = 1; }')
parse_string('message TestMessage { string id = 1; uint64 shop_id = 2; bool boolean = 3; }')






# TODO: write some tests, try to parse:
# ./test/fixtures/test.proto








0 comments on commit 98e3c6c

Please sign in to comment.