Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add utf8 support for string literal #127

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# dev
- Add utf8 support for string literal (#127)

# 3.3 (2024-10-29)
- Add support for unicode `16.0.0` (#157)
- Add API for retrieving start and stop positions separately (#155)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ with a length different from one.


Note:
- The OCaml source is assumed to be encoded in Latin1 (for string
- The OCaml source is assumed to be encoded in utf8 (for string
and character literals).


Expand Down
72 changes: 43 additions & 29 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ module Cset = Sedlex_cset
(* Decision tree for partitions *)

let default_loc = Location.none
let lident_loc ~loc s = { loc; txt = lident s }

type decision_tree =
| Lte of int * decision_tree * decision_tree
Expand Down Expand Up @@ -285,29 +284,38 @@ let codepoint i =
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
i

let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))

let regexp_for_string s =
let rec aux n =
if n = String.length s then Sedlex.eps
else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
in
aux 0
let char c = Cset.singleton (Char.code c)
let uchar c = Cset.singleton (Uchar.to_int c)

let err loc s =
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s))

let rev_csets_of_string ~loc ~utf8 s =
if utf8 then
Utf8.fold
~f:(fun acc _ x ->
match x with
| `Malformed _ -> err loc "Malformed utf-8 string"
| `Uchar c -> uchar c :: acc)
[] s
else (
let l = ref [] in
for i = 0 to String.length s - 1 do
l := char s.[i] :: !l
done;
!l)

let rec repeat r = function
| 0, 0 -> Sedlex.eps
| 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1)))
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))

let regexp_of_pattern env =
let rec char_pair_op func name p tuple =
let rec char_pair_op func name ~utf8 p tuple =
(* Construct something like Sub(a,b) *)
match tuple with
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
match func (aux p0) (aux p1) with
match func (aux ~utf8 p0) (aux ~utf8 p1) with
| Some r -> r
| None ->
err p.ppat_loc @@ "the " ^ name
Expand All @@ -317,16 +325,20 @@ let regexp_of_pattern env =
| _ ->
err p.ppat_loc @@ "the " ^ name
^ " operator requires two arguments, like " ^ name ^ "(a,b)"
and aux p =
and aux ~utf8 p =
(* interpret one pattern node *)
match p.ppat_desc with
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
| Ppat_or (p1, p2) -> Sedlex.alt (aux ~utf8 p1) (aux ~utf8 p2)
| Ppat_tuple (p :: pl) ->
List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl
List.fold_left
(fun r p -> Sedlex.seq r (aux ~utf8 p))
(aux ~utf8 p) pl
| Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) ->
Sedlex.rep (aux p)
Sedlex.rep (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) ->
Sedlex.plus (aux p)
Sedlex.plus (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) ->
aux ~utf8:true p
| Ppat_construct
( { txt = Lident "Rep" },
Some
Expand All @@ -346,19 +358,19 @@ let regexp_of_pattern env =
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
let i1 = int_of_string i1 in
let i2 = int_of_string i2 in
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
if 0 <= i1 && i1 <= i2 then repeat (aux ~utf8 p0) (i1, i2)
else err p.ppat_loc "Invalid range for Rep operator"
| _ ->
err p.ppat_loc "Rep must take an integer constant or interval"
end
| Ppat_construct ({ txt = Lident "Rep" }, _) ->
err p.ppat_loc "the Rep operator takes 2 arguments"
| Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) ->
Sedlex.alt Sedlex.eps (aux p)
Sedlex.alt Sedlex.eps (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin
match arg with
| Some (_, p0) -> begin
match Sedlex.compl (aux p0) with
match Sedlex.compl (aux ~utf8 p0) with
| Some r -> r
| None ->
err p.ppat_loc
Expand All @@ -368,10 +380,10 @@ let regexp_of_pattern env =
| _ -> err p.ppat_loc "the Compl operator requires an argument"
end
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
char_pair_op Sedlex.subtract "Sub" p
char_pair_op ~utf8 Sedlex.subtract "Sub" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
char_pair_op Sedlex.intersection "Intersect" p
char_pair_op ~utf8 Sedlex.intersection "Intersect" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Chars" }, arg) -> (
let const =
Expand All @@ -381,11 +393,9 @@ let regexp_of_pattern env =
in
match const with
| Some (Pconst_string (s, _, _)) ->
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
Sedlex.chars !c
let l = rev_csets_of_string ~loc:p.ppat_loc ~utf8 s in
let chars = List.fold_left Cset.union Cset.empty l in
Sedlex.chars chars
| _ ->
err p.ppat_loc "the Chars operator requires a string argument")
| Ppat_interval (i_start, i_end) -> begin
Expand All @@ -401,8 +411,12 @@ let regexp_of_pattern env =
end
| Ppat_constant const -> begin
match const with
| Pconst_string (s, _, _) -> regexp_for_string s
| Pconst_char c -> regexp_for_char c
| Pconst_string (s, _, _) ->
let rev_l = rev_csets_of_string s ~loc:p.ppat_loc ~utf8 in
List.fold_left
(fun acc cset -> Sedlex.seq (Sedlex.chars cset) acc)
Sedlex.eps rev_l
| Pconst_char c -> Sedlex.chars (char c)
| Pconst_integer (i, _) ->
Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
Expand All @@ -414,7 +428,7 @@ let regexp_of_pattern env =
end
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
in
aux
aux ~utf8:false

let previous = ref []
let regexps = ref []
Expand Down
73 changes: 73 additions & 0 deletions src/syntax/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
let unsafe_byte s j = Char.code (String.unsafe_get s j)
let malformed s j l = `Malformed (String.sub s j l)

let width = function
| '\000' .. '\127' -> 1
| '\192' .. '\223' -> 2
| '\224' .. '\239' -> 3
| '\240' .. '\247' -> 4
| _ -> 0

let r_utf_8 s j l =
(* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
let uchar c = `Uchar (Uchar.unsafe_of_int c) in
match l with
| 1 -> uchar (unsafe_byte s j)
| 2 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
if b1 lsr 6 != 0b10 then malformed s j l
else uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
| 3 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let c =
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
in
if b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xE0 ->
if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c
| 0xED ->
if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| 4 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let b3 = unsafe_byte s (j + 3) in
let c =
((b0 land 0x07) lsl 18)
lor ((b1 land 0x3F) lsl 12)
lor ((b2 land 0x3F) lsl 6)
lor (b3 land 0x3F)
in
if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xF0 ->
if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c
| 0xF4 ->
if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| _ -> assert false

let fold ~f acc s =
let rec loop acc f s i last =
if i > last then acc
else (
let need = width (String.unsafe_get s i) in
if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last
else (
let rem = last - i + 1 in
if rem < need then f acc i (malformed s i rem)
else loop (f acc i (r_utf_8 s i need)) f s (i + need) last))
in
let pos = 0 in
let len = String.length s in
let last = pos + len - 1 in
loop acc f s pos last
5 changes: 5 additions & 0 deletions src/syntax/utf8.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
val fold :
f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) ->
'a ->
string ->
'a
19 changes: 19 additions & 0 deletions test/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
open Printf

let next_tok buf =
let open Sedlexing.Utf8 in
match%sedlex buf with
| "a", Utf8 (Chars "+-×÷") -> sprintf "with Chars: %s" (lexeme buf)
| "b", Utf8 ("+" | "-" | "×" | "÷") ->
sprintf "with or_pattern: %s" (lexeme buf)
| _ -> failwith (sprintf "Unexpected character: %s" (lexeme buf))

let%expect_test _ =
Sedlexing.Utf8.from_string "a+" |> next_tok |> print_string;
[%expect {| with Chars: a+ |}];
Sedlexing.Utf8.from_string "a÷" |> next_tok |> print_string;
[%expect {| with Chars: a÷ |}];
Sedlexing.Utf8.from_string "b+" |> next_tok |> print_string;
[%expect {| with or_pattern: b+ |}];
Sedlexing.Utf8.from_string "b÷" |> next_tok |> print_string;
[%expect {| with or_pattern: b÷ |}]
Loading