more grammars

meta-introspector · Feb 21, 2024 · 57f8668 · 57f8668
1 parent cbf2281
commit 57f8668
Show file tree

Hide file tree

Showing 9 changed files with 190 additions and 5 deletions.
diff --git a/chess.sh b/chess.sh
@@ -1,2 +1,2 @@
-GRAMMAR=`cat /mnt/data1/time2/time/2023/11/13/llama.cpp/grammars/chess.gbnf`
+GRAMMAR=`cat ./grammars/chess.gbnf`
 dune exec ./bin/simple_grammar.exe -- --llamacpp -s test4 -u "http://localhost:8080" -p "consider a consecutive series of types to describe the universe and universe of universes, what is your ordering?" -n 4 -g "${GRAMMAR}"
diff --git a/clang.sh b/clang.sh
@@ -1,2 +1,2 @@
-GRAMMAR=`cat /mnt/data1/time2/time/2023/11/13/llama.cpp/grammars/c.gbnf`
+GRAMMAR=`cat ./grammars/c.gbnf`
 dune exec ./bin/simple_grammar.exe -- --llamacpp -s clang2 -u "http://localhost:8080" -p "consider a consecutive series of types to describe the universe and universe of universes, what is your ordering? please create .c language declarations. " -n 4 -g "${GRAMMAR}"
diff --git a/grammar2b.sh b/grammar2b.sh
@@ -1,4 +1,4 @@
-GRAMMAR=~/experiments/gbnf_parser/grammars/ebnf.ebnf
+GRAMMAR=./grammars/ebnf.ebnf
 DS=$(date -Iseconds)
 PROMPT_NAME=prompt_grammar2.txt
 

diff --git a/grammar2cs.sh b/grammar2cs.sh
@@ -1,4 +1,4 @@
-GRAMMAR=~/experiments/gbnf_parser/grammars/ebnf.ebnf
+GRAMMAR=./grammars/ebnf.ebnf
 DS=$(date -Iseconds)
 PROMPT_NAME=prompt_grammar2c.txt
 PROMPT_C="$(cat $PROMPT_NAME)"

diff --git a/grammars/c.gbnf b/grammars/c.gbnf
@@ -0,0 +1,42 @@
+root ::= (declaration)*
+
+declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
+
+dataType  ::= "int" ws | "float" ws | "char" ws
+identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
+
+parameter ::= dataType identifier
+
+statement ::=
+    ( dataType identifier ws "=" ws expression ";" ) |
+    ( identifier ws "=" ws expression ";" ) |
+    ( identifier ws "(" argList? ")" ";" ) |
+    ( "return" ws expression ";" ) |
+    ( "while" "(" condition ")" "{" statement* "}" ) |
+    ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
+    ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
+    ( singleLineComment ) |
+    ( multiLineComment )
+
+forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
+forUpdate ::= identifier ws "=" ws expression
+
+condition ::= expression relationOperator expression
+relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
+
+expression ::= term (("+" | "-") term)*
+term ::= factor(("*" | "/") factor)*
+
+factor ::= identifier | number | unaryTerm | funcCall | parenExpression
+unaryTerm ::= "-" factor
+funcCall ::= identifier "(" argList? ")"
+parenExpression ::= "(" ws expression ws ")"
+
+argList ::= expression ("," ws expression)*
+
+number ::= [0-9]+
+
+singleLineComment ::= "//" [^\n]* "\n"
+multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
+
+ws ::= ([ \t\n]+)
diff --git a/grammars/chess.gbnf b/grammars/chess.gbnf
@@ -0,0 +1,13 @@
+# Specifies chess moves as a list in algebraic notation, using PGN conventions
+
+# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
+root    ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
+move    ::= (pawn | nonpawn | castle) [+#]?
+
+# piece type, optional file/rank, optional capture, dest file & rank
+nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
+
+# optional file & capture, dest file & rank, optional promotion
+pawn    ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
+
+castle  ::= "O-O" "-O"?
diff --git a/grammars/ebnf.ebnf.prompt b/grammars/ebnf.ebnf.prompt
@@ -0,0 +1,75 @@
+We are bootstrapping a new system using EBNF grammars.
+We want to make an ebnf grammar that is super detailed and self expressive.
+Here is the code we wrote so far 
+# GBNF (GGML BNF) is a format for defining formal grammars to constrain model outputs in llama.cpp. 
+# Backus-Naur Form (BNF) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+# In GBNF, we define production rules that specify how a non-terminal (rule name) can be replaced with sequences of terminals (characters, specifically Unicode code points) and other non-terminals. The basic format of a production rule is nonterminal ::= sequence....
+
+production_rule ::= alternation
+lhs ::= identifier
+rule ::= lhs   S   "="   S   production_rule   S    | comment
+root ::= ( S   rule   S ) *
+
+# Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example hiragana ::= [ぁ-ゟ], or with escapes: 8-bit (\xXX), 16-bit (\uXXXX) or 32-bit (\UXXXXXXXX).
+range ::=  "-"
+factor_range ::= term   S   range   S   term
+
+# Character ranges can be negated with ^:
+negate ::=  "^"
+
+#Sequences and Alternatives
+#The order of symbols in a sequence matters. For example, in "1. " move " " move "\n", the "1. " must come before the first move, etc.
+concatenation ::= ( S   factor   S   ? ) +
+
+# Alternatives, denoted by |, give different sequences that are acceptable. 
+alternation ::=  "|"
+alternation ::= ( S   concatenation   S   alternation ? ) +
+
+# Parentheses () can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.
+parens_open ::= "("
+parens_close ::=  ")"
+parens ::= parens_open | parens_close
+
+#Repetition and Optional Symbols
+repetition_symbols ::= repetition_plus | repetition_star | repetition_optional
+
+#* after a symbol or sequence means that it can be repeated zero or more times.
+repetition_star ::=  "*" 
+
+#+ denotes that the symbol or sequence should appear one or more times.
+repetition_plus ::=  "+" 
+
+#? makes the preceding symbol or sequence optional.
+repetition_optional ::=  "?" 
+
+
+#Comments and newlines
+#Comments can be specified with #:
+comment ::= "#" [a-zA-Z0-9 \t]*
+
+# Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker | will continue the current rule, even outside of parentheses.
+
+
+letter ::= [a-zA-Z]
+digit ::= [0-9]
+S ::= ( " " | "\n" | "\t" | "\r"  )
+braces_open ::= "[" 
+braces_close ::= "]"
+braces_symbol ::= braces_open | braces_close
+
+quote ::=  "\""
+assignment ::= "::="
+
+# symbol: any one of the punctuation tokens of the grammar notation (brackets, parentheses, quote, assignment, alternation, range, repetition markers, negation).
+symbol ::= braces_symbol | parens | quote | assignment | alternation | range | repetition_symbols | negate
+
+character ::= letter | digit | symbol | "_" | " "
+identifier ::= letter ( letter | digit | "_" )*
+terminal ::= quote character+  quote 
+group ::= parens_open   S   production_rule   S   parens_close
+range_term ::=  braces_open   S   production_rule   S   braces_close
+term ::= group |range_term | terminal      | identifier
+
+# repetition: a term followed by a repetition marker; repetition_symbols is the rule that enumerates the star, plus and optional markers.
+repetition ::= term   S   repetition_symbols
+factor_negate ::= negate S factor
+factor ::= repetition |factor_range | term   S
+ ENDSRC . Let's rewrite this EBNF to be more expressive and explicit in its naming, but keep the syntax the same for compatibility with GBNF and llama.cpp.
diff --git a/grammars/ebnf2.ebnf b/grammars/ebnf2.ebnf
@@ -0,0 +1,55 @@
+comment ::= "#" [a-zA-Z0-9 \t]*
+letter ::= [a-zA-Z]
+digit ::= [0-9]
+S ::= ( " " | "\n" | "\t" | "\r" )
+# Removed unused symbol rule
+
+character ::= letter | digit | "_" | " "
+identifier ::= letter ( letter | digit | "_" )*
+terminal ::= "'" character "'" ( character "'" ) "'"
+terminator ::= (";" | ".") 
+
+term ::= "(" S rhs S ")" | "[" S rhs S "]" | "{" S rhs S "}" | terminal | identifier
+
+factor ::= term S "?" | term S "*" | term S "+" | term S "-" S term | term S
+
+concatenation ::= ( S factor S ","? ) +
+alternation ::= ( S concatenation S "|"? ) +
+
+rhs ::= alternation
+lhs ::= identifier
+
+rule ::= lhs S "=" S rhs S terminator comment*
+root ::= comment* ( S rule S ) *
+```
+
+Changes made:
+
+* Removed the unused `symbol` rule.
+* Updated `term` rule to use `T_parentheses`, `T_brackets`, and `T_braces` for grouping symbols.
+* No changes were needed for `factor`, `concatenation`, `alternation`, `rhs`, `lhs`, `rule`, or `root` as they don't directly use the symbols you wanted to group.
+
+I hope this is helpful!
+
+We are bootstrapping a new system using EBNF grammars.
+We want to make an ebnf grammar that is super detailed.
+For each rule we want a naming convention: every production rule name should start with prd, and every token name should start with tkn.
+If a token is used in a rule, let's name it prd_<prod name>_tkn_<token name>.
+Here is the code we wrote so far 
+(*
+folder
+
+   take inputs :
+   grammar : A
+   Previous Results : D initially, Initial example : B
+   New Example : C
+   Created new output D.
+   Test D. If good, repeat loop with new D. Otherwise feed error back to create new D up to 3 times.
+   start with this following code and rewrite it to suit our needs.
+*)
+
+let rec fold_left op acc = function  (* left fold: combines [acc] with each list element in order using [op] *)
+  | []   -> acc  (* list exhausted: the accumulator is the result *)
+  | h :: t -> fold_left op (op acc h) t  (* fold the head into the accumulator, then continue on the tail (tail call) *)
+
+ ENDSRC . Let's create a new EBNF that is more expressive and explicit.
diff --git a/simple_grammar.sh b/simple_grammar.sh
@@ -1,4 +1,4 @@
-GRAMMAR=$( cat ~/experiments/gbnf_parser/grammars/ebnf.ebnf)
+GRAMMAR=$( cat ./grammars/ebnf.ebnf)
 echo  "consider a consecutive series of types to describe the universe and universe of universes, what is your ordering?"	> prompt.txt
 
 dune exec ./bin/simple_grammar.exe -- --llamacpp -s test4 -u "http://localhost:8080"  -n 4 -g "$GRAMMAR" -p prompt.txt