-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparsing.R
97 lines (94 loc) · 3.51 KB
/
parsing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Benchmark input. The size tag is stored as a string because it is embedded
# verbatim in the data file's name ("measurements.<size>.csv").
file_size <- "1e6"
file_name <- paste("measurements", file_size, "csv", sep = ".")

# Fifty two-letter state codes.
states <- c(
  "NC", "MA", "TX", "VT", "OR", "NY", "ND", "NV", "SD", "IN",
  "ID", "RI", "TN", "SC", "PA", "WV", "CT", "NE", "KY", "DE",
  "MT", "ME", "AL", "WI", "IA", "MI", "UT", "LA", "WA", "NM",
  "AR", "MO", "MD", "MN", "KS", "AK", "OK", "NH", "NJ", "AZ",
  "CA", "HI", "IL", "GA", "WY", "CO", "MS", "VA", "OH", "FL"
)
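# Disabled reference implementation (not executed): read the data lines,
# split each one on ",", then compute min / max / mean of the first field
# grouped by the second field.
# NOTE(review): `index` below is built from `lines_list` (one entry per line)
# but is used to index `lines_vector` (one entry per field), so it only lines
# up through recycling; use `seq_along(lines_vector)` if this is re-enabled.
# chunk_size <- as.numeric(file_size)
# con <- file(file_name)
# lines <- scan(con, n = chunk_size, skip = 1, what = character(), quiet = TRUE)
# close(con)
# lines_list <- strsplit(lines, ",", fixed = TRUE, useBytes = TRUE)
# lines_vector <- unlist(lines_list, recursive = FALSE, use.names = FALSE)
# index <- (1:length(lines_list) %% 2) == 1
# split_measurement <- split(as.double(lines_vector[index]), lines_vector[!index])
# summary_stats <- vapply(split_measurement, function(x) c(min(x), max(x), mean(x)), double(3))
# summary_stats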
# Benchmark several ways of turning the "<measurement>,<state>" lines of
# `file_name` into a vector of measurements named by state.
#
# Every expression reads the file itself, so file I/O is included in each
# timing. The scan()-based branches skip the header line (`skip = 1`) and
# read at most `chunk_size` whitespace-separated items as character strings.
#
# bench::mark() settings:
#   filter_gc = FALSE   keep iterations in which garbage collection ran
#   min_iterations = 5  evaluate every expression at least five times
#   check = FALSE       do not compare the results of the expressions
res <- bench::mark(
  # Regex with two capture groups: text before the first comma, and the rest.
  str_extract = {
    chunk_size <- as.numeric(file_size)
    con <- file(file_name)
    lines <- scan(
      con,
      n = chunk_size, skip = 1, what = character(), quiet = TRUE
    )
    close(con)
    # Group 1 is "[^,]+" rather than "\\d+": a digit-only group would capture
    # just the digit run directly before the comma (e.g. "5" from "-12.5"),
    # so this branch would disagree with the comma-splitting branches below.
    # NOTE(review): `m` is indexed as a two-column matrix (one column per
    # requested group) -- confirm against the installed stringr version.
    m <- lines |>
      stringr::str_extract("([^,]+),(.*)", group = c(1, 2)) |>
      unlist(recursive = FALSE, use.names = FALSE)
    temps <- m[, 1]
    names(temps) <- m[, 2]
    temps
  },
  # Base strsplit() with its default (regular-expression) matching.
  strsplit = {
    chunk_size <- as.numeric(file_size)
    con <- file(file_name)
    lines <- scan(
      con,
      n = chunk_size, skip = 1, what = character(), quiet = TRUE
    )
    close(con)
    lines_vector <- lines |>
      strsplit(split = ",") |>
      unlist(recursive = FALSE, use.names = FALSE)
    # After flattening, odd positions are used as the values and even
    # positions as their names.
    index <- (seq_along(lines_vector) %% 2) == 1
    temps <- lines_vector[index]
    names(temps) <- lines_vector[!index]
    temps
  },
  # Base strsplit() with literal matching (`fixed = TRUE`).
  strsplit_fixed = {
    chunk_size <- as.numeric(file_size)
    con <- file(file_name)
    lines <- scan(
      con,
      n = chunk_size, skip = 1, what = character(), quiet = TRUE
    )
    close(con)
    lines_vector <- lines |>
      strsplit(split = ",", fixed = TRUE) |>
      unlist(recursive = FALSE, use.names = FALSE)
    index <- (seq_along(lines_vector) %% 2) == 1
    temps <- lines_vector[index]
    names(temps) <- lines_vector[!index]
    temps
  },
  # stringi literal split, called directly with positional arguments.
  stringi = {
    chunk_size <- as.numeric(file_size)
    con <- file(file_name)
    lines <- scan(
      con,
      n = chunk_size, skip = 1, what = character(), quiet = TRUE
    )
    close(con)
    lines_vector <- stringi::stri_split_fixed(lines, ",") |>
      unlist(recursive = FALSE, use.names = FALSE)
    index <- (seq_along(lines_vector) %% 2) == 1
    temps <- lines_vector[index]
    names(temps) <- lines_vector[!index]
    temps
  },
  # Same stringi split, written as a pipeline with a named `pattern`.
  stringi2 = {
    chunk_size <- as.numeric(file_size)
    con <- file(file_name)
    lines <- scan(
      con,
      n = chunk_size, skip = 1, what = character(), quiet = TRUE
    )
    close(con)
    lines_vector <- lines |>
      stringi::stri_split_fixed(pattern = ",") |>
      unlist(recursive = FALSE, use.names = FALSE)
    index <- (seq_along(lines_vector) %% 2) == 1
    temps <- lines_vector[index]
    names(temps) <- lines_vector[!index]
    temps
  },
  # Let read.delim() parse the whole file, then pull the two columns by the
  # names given in the header line.
  read_delim = {
    df <- read.delim(file_name, sep = ",", header = TRUE)
    temps <- df$measurement
    names(temps) <- df$state
    temps
  },
  filter_gc = FALSE,
  min_iterations = 5,
  check = FALSE
)
print(res)