Merge directory from misc-research repo

Rucknium · Aug 10, 2022 · 6a00939 · 6a00939
2 parents fb39617 + d425716
commit 6a00939
Show file tree

Hide file tree

Showing 4 changed files with 485 additions and 0 deletions.
diff --git a/construct-edgelist.R b/construct-edgelist.R
@@ -0,0 +1,111 @@
+# install.packages("data.table")
+# install.packages("RSQLite")
+# install.packages("DBI")
+
+library(data.table)
+library(RSQLite)
+library(DBI)
+
+# Directory that holds the input data. Must end with a trailing "/".
+data.dir <- ""
+
+# Variant of RSQLite's sqliteWriteTable function that rejects duplicates.
+# From https://gist.github.com/jeffwong/5925000
+# NOTE(review): presumably this is what defines mysqliteWriteTable(), which is
+# called further down in this script -- confirm against the gist.
+source("https://gist.githubusercontent.com/jeffwong/5925000/raw/bf02ed0dd2963169a91664be02fb18e45c4d1e20/sqlitewritetable.R")
+
+con <- DBI::dbConnect(drv = RSQLite::SQLite(),
+  dbname = paste0(data.dir, "tx-graph-node-indices.db"))
+
+# Raise the page limit so that SQL database files up to 4 TB are allowed. See:
+# https://stackoverflow.com/questions/16685016/sqlite3-operationalerror-database-or-disk-is-full-on-lustre
+DBI::dbExecute(conn = con, statement = "PRAGMA max_page_count = 4294967292;")
+
+# Optional, disabled by default: put the temp file in RAM
+# (https://stackoverflow.com/a/19259699)
+# DBI::dbExecute(con, "PRAGMA temp_store = 2;")
+
+# Lookup table that assigns an auto-incremented integer index to every
+# distinct node label
+DBI::dbExecute(conn = con, statement = "CREATE TABLE nodes (
+node TEXT,
+node_index INTEGER PRIMARY KEY AUTOINCREMENT,
+unique(node)
+)")
+
+
+# Start from an empty edgelist table so that the column types are fixed
+# before any chunk is appended
+DBI::dbWriteTable(conn = con, name = "edgelist",
+  value = data.frame(
+    origin = character(0),
+    destination = character(0),
+    value = numeric(0),
+    block_height = integer(0),
+    stringsAsFactors = FALSE
+  ))
+
+# Collect the per-chunk transaction graph files and process them in sorted
+# file name order
+tx.graph.files <- list.files(paste0(data.dir, "tx_graphs/"))
+tx.graph.files <- tx.graph.files[grepl("^tx_graph.+rds$", tx.graph.files)]
+tx.graph.files <- sort(tx.graph.files)
+
+make.output.id <- function(txid, position) {
+  # Builds the node label of a transaction output: the txid, a "-", and the
+  # output position zero-padded to a width of 4.
+  # paste0() turns zero-length input into the single string "-", which would
+  # create a bogus edge, so empty input yields character(0) instead.
+  if (length(txid) == 0) {
+    return(character(0))
+  }
+  paste0(txid, "-",
+    formatC(position, width = 4, format = "f", flag = "0", digits = 0))
+}
+
+for (file.iter in tx.graph.files) {
+
+  tx.graph.chunk <- readRDS(paste0(data.dir, "tx_graphs/", file.iter))
+
+  incoming <- tx.graph.chunk$incoming
+  outgoing <- tx.graph.chunk$outgoing
+
+  # Two kinds of edges are stacked into one table:
+  #   incoming: spent output  -> spending transaction (value is unknown, so NA)
+  #   outgoing: transaction   -> output that it creates (carries the value)
+  # rep() is used for the NA column so that an empty incoming set gives zero
+  # rows instead of a single row padded with NA.
+  tx.graph.chunk <-
+    rbind(
+      data.table(origin = make.output.id(incoming$origin.txid, incoming$origin.position),
+        destination = incoming$txid,
+        value = rep(NA_real_, length(incoming$txid)),
+        block_height = as.integer(incoming$block_height), stringsAsFactors = FALSE),
+      data.table(origin = outgoing$txid,
+        destination = make.output.id(outgoing$txid, outgoing$position),
+        value = outgoing$value,
+        block_height = as.integer(outgoing$block_height), stringsAsFactors = FALSE)
+    )
+
+  if (nrow(tx.graph.chunk) == 0) {
+    # Nothing to store for this file; still report progress
+    cat(file.iter, base::date(), "\n")
+    next
+  }
+
+  DBI::dbWriteTable(con, "edgelist",
+    tx.graph.chunk, append = TRUE)
+
+  cat(file.iter, base::date(), "\n")
+
+  # Every label that appears on either side of an edge is a candidate node
+  new.nodes <- unique(c(tx.graph.chunk$origin, tx.graph.chunk$destination))
+
+  # node_index is left as NA so that the database assigns it
+  nodes.to.insert <- data.frame(node = new.nodes, node_index = NA, stringsAsFactors = FALSE)
+
+  mysqliteWriteTable(con, "nodes",
+    nodes.to.insert, append = TRUE, row.names = FALSE, ignore = TRUE)
+
+  # This is the number of candidate rows handed to the writer
+  cat(nrow(nodes.to.insert), "Nodes written\n")
+
+}
+
+
+# Step 1: attach the integer index of each edge's origin node.
+# Both intermediate tables are created with overwrite = TRUE so that these
+# steps can be re-run after an interrupted attempt.
+DBI::dbWriteTable(con, "edgelist_intermediate_1",
+  data.frame(origin = character(0), destination = character(0),
+    value = numeric(0), block_height = integer(0),
+    node_index = integer(0), stringsAsFactors = FALSE), overwrite = TRUE)
+
+# Timestamps are printed explicitly so that they also appear when the script
+# is run through source()
+print(base::date())
+DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_1 SELECT 
+  origin, destination, value, block_height, node_index FROM
+  edgelist JOIN nodes ON edgelist.origin = nodes.node")
+print(base::date())
+
+
+# Free up the name node_index for the second join
+DBI::dbExecute(con,
+  "ALTER TABLE edgelist_intermediate_1 RENAME COLUMN node_index TO origin_index")
+
+
+# Step 2: attach the integer index of each edge's destination node
+DBI::dbWriteTable(con, "edgelist_intermediate_2",
+  data.frame(origin = character(0), destination = character(0),
+    origin_index = integer(0), node_index = integer(0),
+    value = numeric(0), block_height = integer(0), stringsAsFactors = FALSE),
+  overwrite = TRUE)
+
+print(base::date())
+DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_2 SELECT 
+  origin, destination, origin_index, node_index, value, block_height FROM
+  edgelist_intermediate_1 JOIN nodes ON edgelist_intermediate_1.destination = nodes.node")
+print(base::date())
+
+DBI::dbExecute(con,
+  "ALTER TABLE edgelist_intermediate_2 RENAME COLUMN node_index TO destination_index")
+
+# Last use of the database in this script: release the connection
+DBI::dbDisconnect(con)
+
+
diff --git a/create-dataset-for-analysis.R b/create-dataset-for-analysis.R
@@ -0,0 +1,143 @@
+
+library(data.table)
+library(RSQLite)
+library(DBI)
+# NOTE: The lubridate package must also be installed. It is not attached with
+# library() because attaching it would mask other functions.
+
+# Set to TRUE when the data being processed is BTC
+is.btc <- FALSE
+
+# Directory that holds the input data. Must end with a trailing "/".
+data.dir <- ""
+
+
+con <- DBI::dbConnect(drv = RSQLite::SQLite(),
+  dbname = paste0(data.dir, "tx-graph-node-indices.db"))
+
+
+# Column selection shared by every query against the indexed edge list
+edgelist.query <- "SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2"
+
+get.created.and.spent <- function(edges) {
+  # Splits one batch of edge list rows (a data.frame with the columns
+  # selected by edgelist.query) into a list of two data.tables:
+  #   created: output_index and output.created.block_height, taken from the
+  #            rows that have a non-missing, positive value
+  #   spent:   output_index and output.spent.block_height, taken from all rows
+  # Only positive _value_s are included for output created, since that's the
+  # value of the created output. The "inner merge" done by the caller then
+  # picks the proper outputs on the spending side.
+  created <- edges[( ! is.na(edges$value)) & edges$value > 0,
+    c("destination_index", "block_height")]
+  colnames(created) <- c("output_index", "output.created.block_height")
+  setDT(created)
+
+  spent <- edges[, c("origin_index", "block_height")]
+  colnames(spent) <- c("output_index", "output.spent.block_height")
+  setDT(spent)
+
+  list(created = created, spent = spent)
+}
+
+if ( ! is.btc) {
+
+  # The whole edge list is loaded with a single request. master.edgelist has
+  # to be fetched here; nothing earlier in the script creates it.
+  master.edgelist <- DBI::dbGetQuery(con, edgelist.query)
+
+  edgelist.parts <- get.created.and.spent(master.edgelist)
+  rm(master.edgelist)
+
+  master.edgelist.output.spent <- merge(edgelist.parts$created, edgelist.parts$spent)
+  rm(edgelist.parts)
+
+} else {
+
+  # Split data requests and processing if BTC to avoid integer overflow issue in this step of processing.
+  # data.table issue:
+  # https://github.com/Rdatatable/data.table/issues/3957
+  # SQLite issue: "negative length vectors are not allowed"
+
+  # One request per block height range; together the ranges cover all heights
+  height.ranges <- c(
+    "block_height < 500000",
+    "block_height > 499999 AND block_height < 700000",
+    "block_height > 699999"
+  )
+
+  edgelist.parts <- lapply(height.ranges, function(height.range) {
+    get.created.and.spent(
+      DBI::dbGetQuery(con, paste0(edgelist.query, " WHERE ", height.range)))
+  })
+
+  # An output can be spent in a later range than the one it was created in,
+  # so every spent batch is merged against the created outputs of all ranges
+  master.edgelist.output.created <- rbindlist(
+    lapply(edgelist.parts, function(part) part$created))
+
+  # The merge itself stays split by range, then the results are stacked in
+  # range order
+  master.edgelist.output.spent <- rbindlist(
+    lapply(edgelist.parts, function(part) {
+      merge(master.edgelist.output.created, part$spent)
+    }))
+
+  rm(master.edgelist.output.created, edgelist.parts)
+
+}
+
+# The database is not used after this point: release the connection
+DBI::dbDisconnect(con)
+
+
+
+# Attach block timestamps, first for the block that created each output and
+# then for the block that spent it.
+# NOTE(review): assumes block_times.rds has exactly two columns, block height
+# followed by block time -- confirm against the file that produces it.
+block.times <- readRDS(paste0(data.dir, "block_times.rds"))
+colnames(block.times) <- c("output.created.block_height", "output.created.block_time")
+setDT(block.times)
+
+master.edgelist.output.spent <- merge(master.edgelist.output.spent, block.times,
+  by = "output.created.block_height")
+
+# The same lookup table is reused, relabelled, for the spending side
+setnames(block.times, c("output.spent.block_height", "output.spent.block_time"))
+
+master.edgelist.output.spent <- merge(master.edgelist.output.spent, block.times,
+  by = "output.spent.block_height")
+
+# TODO: Explore phenomenon of out-of-order block timestamps and decide what to do about them
+
+# Age of an output at the moment it is spent, as a difference of block times
+master.edgelist.output.spent[,
+  output.spend.age := output.spent.block_time - output.created.block_time]
+
+# These reduce RAM usage if desired:
+# master.edgelist.output.spent[, output.created.block_time := NULL]
+# master.edgelist.output.spent[, output.created.block_height := NULL]
+# master.edgelist.output.spent[, output_index := NULL]
+
+
+# Week labels are computed once per distinct spend time and merged back
+# afterwards, instead of being computed for every row
+output.spent.block_time.intermediate <- unique(master.edgelist.output.spent[, .(output.spent.block_time)])
+
+output.spent.block_time.intermediate[,
+  c("output.spent.block_time.week", "output.spent.block_time.isoweekyear") := {
+    spent.time <- as.POSIXct(output.spent.block_time, origin = "1970-01-01", tz = "UTC")
+    list(lubridate::isoweek(spent.time), lubridate::isoyear(spent.time))
+  }]
+
+# Replace the week number with a label made of the ISO year, a "-", and the
+# two-digit ISO week
+output.spent.block_time.intermediate[,
+  output.spent.block_time.week := factor(
+    paste0(output.spent.block_time.isoweekyear, "-",
+      formatC(output.spent.block_time.week, width = 2, flag = "0")))]
+
+master.edgelist.output.spent <- merge(
+  master.edgelist.output.spent,
+  output.spent.block_time.intermediate[, .(output.spent.block_time, output.spent.block_time.week)],
+  by = "output.spent.block_time")
+
+# The result is written twice: with the default compression and uncompressed
+saveRDS(master.edgelist.output.spent,
+  file = paste0(data.dir, "master_edgelist_output_spent.rds"))
+
+saveRDS(master.edgelist.output.spent,
+  file = paste0(data.dir, "master_edgelist_output_spent-uncompressed.rds"),
+  compress = FALSE)
+
+