Skip to content

Commit

Permalink
Merge directory from misc-research repo
Browse files Browse the repository at this point in the history
  • Loading branch information
Rucknium committed Aug 10, 2022
2 parents fb39617 + d425716 commit 6a00939
Show file tree
Hide file tree
Showing 4 changed files with 485 additions and 0 deletions.
111 changes: 111 additions & 0 deletions construct-edgelist.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# install.packages("data.table")
# install.packages("RSQLite")
# install.packages("DBI")

library(data.table)
library(RSQLite)
library(DBI)

# Directory that holds the input data. Fill this in; it is concatenated
# directly with file names below, so it must end with a trailing "/".
data.dir <- ""

# Pull in mysqliteWriteTable() from a gist pinned to a fixed revision
# (see https://gist.github.com/jeffwong/5925000). According to the gist it is
# a modified version of RSQLite's sqliteWriteTable that rejects duplicates.
# NOTE(review): this executes remote code at run time; the pinned revision
# hash in the URL limits, but does not remove, that exposure.
write.table.gist.url <- "https://gist.githubusercontent.com/jeffwong/5925000/raw/bf02ed0dd2963169a91664be02fb18e45c4d1e20/sqlitewritetable.R"
source(write.table.gist.url)

# Open (or create) the SQLite database that stores the node index and edgelist.
node.index.db.path <- paste0(data.dir, "tx-graph-node-indices.db")
con <- DBI::dbConnect(RSQLite::SQLite(), node.index.db.path)

# Raise the page-count ceiling so the database file is allowed to grow very
# large. The linked discussion cites roughly 4 TB; presumably the real ceiling
# depends on the database page size.
# https://stackoverflow.com/questions/16685016/sqlite3-operationalerror-database-or-disk-is-full-on-lustre
DBI::dbExecute(con, "PRAGMA max_page_count = 4294967292;")

# Optional: keep SQLite temp files in RAM (https://stackoverflow.com/a/19259699)
# DBI::dbExecute(con, "PRAGMA temp_store = 2;")

# nodes: one row per distinct node label. node_index is assigned by SQLite
# (AUTOINCREMENT) and the unique() constraint forbids repeated labels.
create.nodes.sql <- "CREATE TABLE nodes (
node TEXT,
node_index INTEGER PRIMARY KEY AUTOINCREMENT,
unique(node)
)"
DBI::dbExecute(con, create.nodes.sql)

# edgelist: created empty here so that its column types are fixed before any
# chunk is appended.
empty.edgelist <- data.frame(
  origin = character(0),
  destination = character(0),
  value = numeric(0),
  block_height = integer(0),
  stringsAsFactors = FALSE)
DBI::dbWriteTable(con, "edgelist", empty.edgelist)

# Collect the per-chunk transaction graph files (tx_graph*rds) in sorted order.
tx.graph.files <- list.files(paste0(data.dir, "tx_graphs/"))
tx.graph.files <- tx.graph.files[grepl("^tx_graph.+rds$", tx.graph.files)]
tx.graph.files <- sort(tx.graph.files)

# Builds output node labels of the form "<txid>-<position>", with the position
# zero-padded to at least four digits.
# Returns character(0) for empty input: paste0() would otherwise recycle the
# zero-length arguments to "" and return the single bogus label "-".
make.output.id <- function(txid, position) {
  if (length(txid) == 0) {return(character(0))}
  paste0(txid, "-",
    formatC(position, width = 4, format = "f", flag = "0", digits = 0))
}

# For every chunk: turn it into edges, append the edges to `edgelist`, and
# register every node label seen in `nodes`.
for (file.iter in tx.graph.files) {

  tx.graph.chunk <- readRDS(paste0(data.dir, "tx_graphs/", file.iter))
  # Presumably a list with `incoming` and `outgoing` tables -- only the
  # columns referenced below are relied upon.
  incoming <- tx.graph.chunk$incoming
  outgoing <- tx.graph.chunk$outgoing

  tx.graph.chunk <-
    rbind(
      # Spent output -> spending transaction. No value is recorded on this
      # side; the NA column is sized explicitly so that an empty `incoming`
      # gives a zero-row table instead of a recycled one-row table.
      data.table(origin = make.output.id(incoming$origin.txid, incoming$origin.position),
        destination = incoming$txid,
        value = rep(NA_real_, length(incoming$txid)),
        block_height = as.integer(incoming$block_height), stringsAsFactors = FALSE),
      # Creating transaction -> created output, carrying the output's value.
      data.table(origin = outgoing$txid,
        destination = make.output.id(outgoing$txid, outgoing$position),
        value = outgoing$value,
        block_height = as.integer(outgoing$block_height), stringsAsFactors = FALSE)
    )

  DBI::dbWriteTable(con, "edgelist",
    tx.graph.chunk, append = TRUE)

  cat(file.iter, base::date(), "\n")

  # Nothing to index for a chunk without edges.
  if (nrow(tx.graph.chunk) == 0) {next}

  new.nodes <- unique(c(tx.graph.chunk$origin, tx.graph.chunk$destination))

  # node_index is left NA so that SQLite assigns it (AUTOINCREMENT primary key).
  nodes.to.insert <- data.frame(node = new.nodes, node_index = NA, stringsAsFactors = FALSE)

  # ignore = TRUE: labels already present in `nodes` are expected to be skipped
  # rather than raise a uniqueness error (behaviour of the sourced helper --
  # TODO confirm against the gist).
  mysqliteWriteTable(con, "nodes",
    nodes.to.insert, append = TRUE, row.names = FALSE, ignore = TRUE)

  # NOTE(review): this is the number of labels submitted; the number actually
  # inserted may be smaller when duplicates are ignored.
  cat(nrow(nodes.to.insert), "Nodes written\n")

}


# Stage 1: attach the integer index of each edge's origin node.
# The empty table is created first to fix the column types; overwrite = TRUE
# lets the stage be re-run against an existing database.
DBI::dbWriteTable(con, "edgelist_intermediate_1",
  data.frame(origin = character(0), destination = character(0),
    value = numeric(0), block_height = integer(0),
    node_index = integer(0), stringsAsFactors = FALSE), overwrite = TRUE)

# Timestamps bracket the join so its duration can be read from the log.
# print() is explicit so they also appear when the script is run via source().
print(base::date())
DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_1 SELECT
origin, destination, value, block_height, node_index FROM
edgelist JOIN nodes ON edgelist.origin = nodes.node")
print(base::date())


# The joined node_index belongs to the origin node; name it accordingly so the
# second join can add another node_index column.
DBI::dbExecute(con,
  "ALTER TABLE edgelist_intermediate_1 RENAME COLUMN node_index TO origin_index")


# Stage 2: attach the integer index of each edge's destination node.
# overwrite = TRUE mirrors stage 1; without it a re-run stops here because the
# table already exists.
DBI::dbWriteTable(con, "edgelist_intermediate_2",
  data.frame(origin = character(0), destination = character(0),
    origin_index = integer(0), node_index = integer(0),
    value = numeric(0), block_height = integer(0), stringsAsFactors = FALSE),
  overwrite = TRUE)

print(base::date())
DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_2 SELECT
origin, destination, origin_index, node_index, value, block_height FROM
edgelist_intermediate_1 JOIN nodes ON edgelist_intermediate_1.destination = nodes.node")
print(base::date())

DBI::dbExecute(con,
  "ALTER TABLE edgelist_intermediate_2 RENAME COLUMN node_index TO destination_index")

# Last use of the connection in this script: release the database handle.
DBI::dbDisconnect(con)


143 changes: 143 additions & 0 deletions create-dataset-for-analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@

library(data.table)
library(RSQLite)
library(DBI)
# NOTE: Also need lubridate package installed, but not loading it due to
# it masking functions

# Set to TRUE when processing BTC; any other chain uses FALSE.
is.btc <- FALSE

# Directory that holds the input data. Fill this in; it is concatenated
# directly with file names, so it must end with a trailing "/".
data.dir <- ""

# Connect to the SQLite database file tx-graph-node-indices.db in data.dir.
node.index.db.path <- paste0(data.dir, "tx-graph-node-indices.db")
con <- DBI::dbConnect(RSQLite::SQLite(), node.index.db.path)


# Columns pulled from the indexed edgelist by every query in this section.
edgelist.query <- "SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2"

# Returns a data.table of created outputs (output_index,
# output.created.block_height) from a queried edgelist data.frame.
# Only edges with a non-missing, positive value are kept, since that value is
# the value of the created output.
extract.output.created <- function(edgelist) {
  is.created <- (! is.na(edgelist$value)) & edgelist$value > 0
  output.created <- edgelist[is.created, c("destination_index", "block_height")]
  colnames(output.created) <- c("output_index", "output.created.block_height")
  setDT(output.created)
  output.created
}

# Returns a data.table of spend candidates (output_index,
# output.spent.block_height) from a queried edgelist data.frame.
# Every edge origin is included; rows that are not real outputs are dropped
# later by the inner merge against the created outputs.
extract.output.spent <- function(edgelist) {
  output.spent <- edgelist[, c("origin_index", "block_height")]
  colnames(output.spent) <- c("output_index", "output.spent.block_height")
  setDT(output.spent)
  output.spent
}

if ( ! is.btc) {

  # Load the whole indexed edgelist in one request.
  master.edgelist <- DBI::dbGetQuery(con, edgelist.query)

  master.edgelist.output.created <- extract.output.created(master.edgelist)
  master.edgelist.output.spent <- extract.output.spent(master.edgelist)
  rm(master.edgelist)

  # Inner merge: keeps only spends whose output_index is a created output.
  master.edgelist.output.spent <- merge(master.edgelist.output.created,
    master.edgelist.output.spent, by = "output_index")
  rm(master.edgelist.output.created)

} else {

  # Split data requests and processing if BTC to avoid integer overflow issue in this step of processing.
  # data.table issue:
  # https://github.com/Rdatatable/data.table/issues/3957
  # SQLite issue: "negative length vectors are not allowed"

  # Block-height ranges; together they cover every block height exactly once.
  height.filters <- c(
    "block_height < 500000",
    "block_height > 499999 AND block_height < 700000",
    "block_height > 699999")

  created.parts <- vector("list", length(height.filters))
  spent.parts <- vector("list", length(height.filters))

  for (i in seq_along(height.filters)) {
    edgelist.part <- DBI::dbGetQuery(con,
      paste0(edgelist.query, " WHERE ", height.filters[i]))
    created.parts[[i]] <- extract.output.created(edgelist.part)
    spent.parts[[i]] <- extract.output.spent(edgelist.part)
    rm(edgelist.part)
  }

  # An output may be spent in a later range than the one it was created in, so
  # each range of spends is merged against the created outputs of ALL ranges.
  master.edgelist.output.created <- rbindlist(created.parts)
  rm(created.parts)

  for (i in seq_along(spent.parts)) {
    spent.parts[[i]] <- merge(master.edgelist.output.created, spent.parts[[i]],
      by = "output_index")
  }

  master.edgelist.output.spent <- rbindlist(spent.parts)

  rm(spent.parts)
  rm(master.edgelist.output.created)

}

# All database reads are done: release the database handle.
DBI::dbDisconnect(con)



# Block height -> block time lookup. The two columns are named first for the
# "created" side of each spend.
# NOTE(review): assumes block_times.rds has exactly two columns, height then
# time, in that order -- confirm against the file that produces it.
block.times <- readRDS(paste0(data.dir, "block_times.rds"))
colnames(block.times) <- c("output.created.block_height", "output.created.block_time")
setDT(block.times)

# Attach the creation time of every spent output.
master.edgelist.output.spent <- merge(
  master.edgelist.output.spent, block.times,
  by = "output.created.block_height")

# Reuse the same lookup for the "spent" side by renaming its columns.
setnames(block.times, c("output.spent.block_height", "output.spent.block_time"))

# Attach the time at which every output was spent.
master.edgelist.output.spent <- merge(
  master.edgelist.output.spent, block.times,
  by = "output.spent.block_height")

# TODO: Explore phenomenon of out-of-order block timestamps and decide what to do about them

# Age of the output when spent: spent block time minus created block time.
master.edgelist.output.spent[,
  output.spend.age := output.spent.block_time - output.created.block_time]

# Optional, to lower RAM usage:
# master.edgelist.output.spent[, output.created.block_time := NULL]
# master.edgelist.output.spent[, output.created.block_height := NULL]
# master.edgelist.output.spent[, output_index := NULL]


# Work out the ISO week label once per distinct spent block time, then merge
# the labels back onto the full table.
output.spent.block_time.intermediate <- unique(
  master.edgelist.output.spent[, .(output.spent.block_time)])

# The block times are interpreted as seconds since 1970-01-01 in UTC.
spent.time.utc <- as.POSIXct(
  output.spent.block_time.intermediate$output.spent.block_time,
  origin = "1970-01-01", tz = "UTC")

output.spent.block_time.intermediate[, `:=`(
  output.spent.block_time.week = lubridate::isoweek(spent.time.utc),
  output.spent.block_time.isoweekyear = lubridate::isoyear(spent.time.utc))]

rm(spent.time.utc)

# Final label is "<ISO year>-<two-digit ISO week>", stored as a factor.
output.spent.block_time.intermediate[,
  output.spent.block_time.week := factor(paste0(
    output.spent.block_time.isoweekyear, "-",
    formatC(output.spent.block_time.week, width = 2, flag = "0")))]

master.edgelist.output.spent <- merge(
  master.edgelist.output.spent,
  output.spent.block_time.intermediate[,
    .(output.spent.block_time, output.spent.block_time.week)],
  by = "output.spent.block_time")

# Write the result twice: with saveRDS's default compression and uncompressed.
saveRDS(master.edgelist.output.spent,
  paste0(data.dir, "master_edgelist_output_spent.rds"))

saveRDS(master.edgelist.output.spent,
  paste0(data.dir, "master_edgelist_output_spent-uncompressed.rds"),
  compress = FALSE)


Loading

0 comments on commit 6a00939

Please sign in to comment.