diff --git a/construct-edgelist.R b/construct-edgelist.R new file mode 100644 index 0000000..2013779 --- /dev/null +++ b/construct-edgelist.R @@ -0,0 +1,103 @@ +# install.packages("data.table") +# install.packages("RSQLite") +# install.packages("DBI") + +library(data.table) +library(RSQLite) +library(DBI) + +data.dir <- "" +# Input data directory here, with trailing "/" + +source("https://gist.githubusercontent.com/jeffwong/5925000/raw/bf02ed0dd2963169a91664be02fb18e45c4d1e20/sqlitewritetable.R") +# From https://gist.github.com/jeffwong/5925000 +# Modifies RSQLite's sqliteWriteTable function so as to reject duplicates + +con <- DBI::dbConnect(RSQLite::SQLite(), paste0(data.dir, "tx-graph-node-indices.db")) + +DBI::dbExecute(con, "CREATE TABLE nodes ( +node TEXT, +node_index INTEGER PRIMARY KEY AUTOINCREMENT, +unique(node) +)") + + +DBI::dbWriteTable(con, "edgelist", + data.frame(origin = character(0), destination = character(0), value = numeric(0), + block_height = integer(0), stringsAsFactors = FALSE)) + +tx.graph.files <- list.files(paste0(data.dir, "tx_graphs/")) +tx.graph.files <- tx.graph.files[grepl("^tx_graph.+rds$", tx.graph.files)] +tx.graph.files <- sort(tx.graph.files) + +tx.graph.indexed <- vector("list", length(tx.graph.files)) +names(tx.graph.indexed) <- tx.graph.files + +for (file.iter in tx.graph.files) { + + tx.graph.chunk <- readRDS(paste0(data.dir, "tx_graphs/", file.iter)) + + tx.graph.chunk <- + rbind( + data.table(origin = paste0(tx.graph.chunk$incoming$origin.txid, "-", + formatC(tx.graph.chunk$incoming$origin.position, width = 4, format = "f", flag = "0", digits = 0)), + destination = tx.graph.chunk$incoming$txid, + value = NA_real_, + block_height = as.integer(tx.graph.chunk$incoming$block_height), stringsAsFactors = FALSE), + data.table(origin = tx.graph.chunk$outgoing$txid, + destination = paste0(tx.graph.chunk$outgoing$txid, "-", + formatC(tx.graph.chunk$outgoing$position, width = 4, format = "f", flag = "0", digits = 0)), + value = tx.graph.chunk$outgoing$value, + block_height = as.integer(tx.graph.chunk$outgoing$block_height), stringsAsFactors = FALSE) + ) + + DBI::dbWriteTable(con, "edgelist", + tx.graph.chunk, append = TRUE) + + cat(file.iter, base::date(), "\n") + + if (nrow(tx.graph.chunk) == 0) {next} + + new.nodes <- unique(c(tx.graph.chunk$origin, tx.graph.chunk$destination)) + + nodes.to.insert <- data.frame(node = new.nodes, node_index = NA, stringsAsFactors = FALSE) + + mysqliteWriteTable(con, "nodes", + nodes.to.insert, append = TRUE, row.names = FALSE, ignore = TRUE) + + cat(nrow(nodes.to.insert), "Nodes written\n") + +} + + +DBI::dbWriteTable(con, "edgelist_intermediate_1", + data.frame(origin = character(0), destination = character(0), + value = numeric(0), block_height = integer(0), + node_index = integer(0), stringsAsFactors = FALSE), overwrite = TRUE) + +base::date() +DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_1 SELECT + origin, destination, value, block_height, node_index FROM + edgelist JOIN nodes ON edgelist.origin = nodes.node") +base::date() + + +DBI::dbExecute(con, + "ALTER TABLE edgelist_intermediate_1 RENAME COLUMN node_index TO origin_index") + + +DBI::dbWriteTable(con, "edgelist_intermediate_2", + data.frame(origin = character(0), destination = character(0), + origin_index = integer(0), node_index = integer(0), + value = numeric(0), block_height = integer(0), stringsAsFactors = FALSE)) + +base::date() +DBI::dbExecute(con, "INSERT INTO edgelist_intermediate_2 SELECT + origin, destination, origin_index, node_index, value, block_height FROM + edgelist_intermediate_1 JOIN nodes ON edgelist_intermediate_1.destination = nodes.node") +base::date() + +DBI::dbExecute(con, + "ALTER TABLE edgelist_intermediate_2 RENAME COLUMN node_index TO destination_index") + + diff --git a/create-dataset-for-analysis.R b/create-dataset-for-analysis.R new file mode 100644 index 0000000..c56eb39 --- /dev/null +++ b/create-dataset-for-analysis.R @@ -0,0 +1,73 @@ + +library(data.table) +library(RSQLite) +library(DBI) +# NOTE: Also need lubridate package installed, but not loading it due to +# it masking functions + +data.dir <- "" +# Input data directory here, with trailing "/" + + +con <- DBI::dbConnect(RSQLite::SQLite(), paste0(data.dir, "tx-graph-node-indices.db")) + +master.edgelist <- DBI::dbGetQuery(con, + "SELECT origin_index, destination_index,block_height,value FROM edgelist_intermediate_2") + +master.edgelist.output.created <- master.edgelist[ + (! is.na(master.edgelist$value)) & master.edgelist$value > 0 , c("destination_index", "block_height")] +colnames(master.edgelist.output.created) <- c("output_index", "output.created.block_height") +setDT(master.edgelist.output.created) + +master.edgelist.output.spent <- master.edgelist[, c("origin_index", "block_height")] +colnames(master.edgelist.output.spent) <- c("output_index", "output.spent.block_height") +setDT(master.edgelist.output.spent) + +# Only include positive _value_s for output created, since that's the value of the created output +# Then, below do an "inner merge" to get the proper outputs on the spending side + +rm(master.edgelist) +master.edgelist.output.spent <- merge(master.edgelist.output.created, master.edgelist.output.spent) +rm(master.edgelist.output.created) + + + +block.times <- readRDS(paste0(data.dir, "block_times.rds")) +colnames(block.times) <- c("output.created.block_height", "output.created.block_time") +setDT(block.times) + +master.edgelist.output.spent <- merge(master.edgelist.output.spent, block.times, by = "output.created.block_height") + +colnames(block.times) <- c("output.spent.block_height", "output.spent.block_time") + +master.edgelist.output.spent <- merge(master.edgelist.output.spent, block.times, by = "output.spent.block_height") + +# TODO: Explore phenomenon of out-of-order block timestamps and decide what to do about them + +master.edgelist.output.spent[, output.spend.age := output.spent.block_time - output.created.block_time] + +# These reduce RAM usage if desired: +# master.edgelist.output.spent[, output.created.block_time := NULL] +# master.edgelist.output.spent[, output.created.block_height := NULL] +# master.edgelist.output.spent[, output_index := NULL] + + +output.spent.block_time.intermediate <- unique(master.edgelist.output.spent[, .(output.spent.block_time)]) + +output.spent.block_time.intermediate[, output.spent.block_time.week := + lubridate::isoweek(as.POSIXct(output.spent.block_time, origin = "1970-01-01", tz = "UTC"))] + +output.spent.block_time.intermediate[, output.spent.block_time.isoweekyear := + lubridate::isoyear(as.POSIXct(output.spent.block_time, origin = "1970-01-01", tz = "UTC"))] + +output.spent.block_time.intermediate[, + output.spent.block_time.week := factor(paste0(output.spent.block_time.isoweekyear, "-", + formatC(output.spent.block_time.week, width = 2, flag = "0")))] + +master.edgelist.output.spent <- merge(master.edgelist.output.spent, + output.spent.block_time.intermediate[, .(output.spent.block_time, output.spent.block_time.week)], by = "output.spent.block_time") + +saveRDS(master.edgelist.output.spent, paste0(data.dir, "master_edgelist_output_spent.rds")) + + + diff --git a/extract-tx-graphs.R b/extract-tx-graphs.R new file mode 100644 index 0000000..00b441a --- /dev/null +++ b/extract-tx-graphs.R @@ -0,0 +1,162 @@ +# install.packages("rbch") +# install.packages("data.table") +# install.packages("future.apply") + +library(rbch) +library(data.table) +library(future.apply) + +is.dogecoin <- FALSE + +blockchain.conf.file <- "" +# Input filepath for your {blockchain}.conf file + +data.dir <- "" +# Input data directory here, with trailing "/" + +current.block.height <- NA_integer_ +# current.block.height <- rbch::getblockchaininfo(bch.config)@result$blocks + +n.threads <- min(c(6, parallelly::availableCores())) +# Recommended no more than 6 threads since all threads query the single blockchain daemon process. + +dir.create(paste0(data.dir, "tx_graphs")) + +blockchain.config <- rbch::conrpc(blockchain.conf.file) +rpcport <- readLines(blockchain.conf.file) +rpcport <- rpcport[grepl("rpcport", rpcport) ] +if (length(rpcport) > 0) { + blockchain.config@url <- paste0("http://127.0.0.1:", gsub("[^0-9]", "", rpcport)) +} + +cut.seq <- seq(20, current.block.height, by = 20) +cut.seq <- c(-1, cut.seq, current.block.height) + +heights.to.process <- 0:current.block.height +heights.to.process <- split(heights.to.process, + cut(heights.to.process, cut.seq)) + +future::plan(future::multiprocess(workers = n.threads)) + + +for (height.set in heights.to.process) { + + extracted.txs <- future.apply::future_lapply(height.set, function(iter.block.height) { + + if (iter.block.height %% 1000 == 0) { + cat(iter.block.height, base::date(), "\n") + } + + block.hash <- rbch::getblockhash(blockchain.config, iter.block.height) + if( ! is.dogecoin) { + block.data <- rbch::getblock(bch.config, blockhash = block.hash@result, verbosity = "l2") + # Argument verbose = 2 gives full transaction data + } else { + block.data <- jsonlite::fromJSON(paste0("http://localhost:22555/rest/block/", block.hash@result, ".json"), simplifyVector = FALSE) + } + #raw.txs.ls <- block.data@result$tx + raw.txs.ls <- block.data$tx + + coinbase.tx <- raw.txs.ls[[1]] + + value <- vector("numeric", length(coinbase.tx$vout) ) + + for (j in seq_along(coinbase.tx$vout)) { + value[j] <- coinbase.tx$vout[[j]]$value + } + + outgoing.coinbase <- data.table(txid = coinbase.tx$txid, + position = seq_along(coinbase.tx$vout), value = value, block_height = iter.block.height, stringsAsFactors = FALSE) + + coinbase.return.value <- list(incoming = + data.table(txid = character(0), origin.txid = character(0), + origin.position = numeric(0), block_height = integer(0), stringsAsFactors = FALSE), + outgoing = outgoing.coinbase) + + if ( length(raw.txs.ls) < 2) { + return(list(coinbase.return.value)) + } + # No incoming txs for coinbase-only + + # Results of this lapply below are returned + + return.value <- lapply(2:length(raw.txs.ls), function(iter) { + # Start at 2 since the first tx is the coinbase tx + + latest.tx <- raw.txs.ls[[iter]] + + # addresses <- vector("character", length(latest.tx$vout) ) + value <- vector("numeric", length(latest.tx$vout) ) + + for (j in seq_along(latest.tx$vout)) { + extracted.address <- latest.tx$vout[[j]]$scriptPubKey$addresses + if (length(extracted.address) > 1) { + extracted.address <- list(paste0(sort(unlist(extracted.address)), collapse = "|")) + # sort() so that the address order is always the same + } + + stopifnot(length(extracted.address[[1]]) <= 1) + if (length(extracted.address) == 0) {next} + + # addresses[j] <- extracted.address[[1]] + value[j] <- latest.tx$vout[[j]]$value + } + + outgoing <- data.table(txid = latest.tx$txid, + # address = addresses, + position = seq_along(latest.tx$vout), value = value, block_height = iter.block.height, stringsAsFactors = FALSE) + + origin.txid <- vector("character", length(latest.tx$vin) ) + origin.position <- vector("numeric", length(latest.tx$vin) ) + + for (j in seq_along(latest.tx$vin)) { + extracted.address <- latest.tx$vin[[j]]$txid + stopifnot(length(extracted.address) <= 1) + stopifnot(length(extracted.address[[1]]) <= 1) + if (length(extracted.address) == 0) {next} + + origin.txid[j] <- latest.tx$vin[[j]]$txid + origin.position[j] <- latest.tx$vin[[j]]$vout + 1 + } + + incoming <- data.table(txid = latest.tx$txid, origin.txid = origin.txid, + origin.position = origin.position, block_height = iter.block.height, stringsAsFactors = FALSE) + + list(incoming = incoming, outgoing = outgoing) + + }) + + return.value[[length(return.value) + 1]] <- coinbase.return.value + # Note that this means that coinbase txs are now "last in the block" + + return.value + + }) + + + print(object.size(extracted.txs), units = "Mb") + + extracted.txs <- unlist(extracted.txs, recursive = FALSE) + + incoming <- data.table::rbindlist(lapply(extracted.txs, function(x) { + x[[1]] + }) + ) + + outgoing <- data.table::rbindlist(lapply(extracted.txs, function(x) { + x[[2]] + }) + ) + + rm(extracted.txs) + + saveRDS(list(incoming = incoming, outgoing = outgoing), + file = paste0(data.dir, "tx_graphs/tx_graph_height_", + paste0(formatC(range(height.set), width = 7, flag = "0"), collapse = "_to_"), ".rds"), + compress = FALSE) + + rm(incoming) + rm(outgoing) + +} + diff --git a/get-block-times.R b/get-block-times.R new file mode 100644 index 0000000..0c68427 --- /dev/null +++ b/get-block-times.R @@ -0,0 +1,52 @@ +library(data.table) +library(rbch) +library(future.apply) + +is.dogecoin <- FALSE + +blockchain.conf.file <- "" +# Input filepath for your {blockchain}.conf file + +data.dir <- "" +# Input data directory here, with trailing "/" + +current.block.height <- NA_integer_ +# current.block.height <- rbch::getblockchaininfo(bch.config)@result$blocks + +n.threads <- min(c(6, parallelly::availableCores())) +# Recommended no more than 6 threads since all threads query the single blockchain daemon process. + +blockchain.config <- rbch::conrpc(blockchain.conf.file) + + +getblock.doge <- function(con, blockhash, verbosity = TRUE) { + bh <- as.character(blockhash) + pl <- unname(list(blockhash = bh, verbosity = verbosity)) + rpcpost(con, "getblock", pl) +} + + +future::plan(future::multiprocess(workers = n.threads)) + +block.times <- future.apply::future_lapply(0:current.block.height, function(iter.block.height) { + + if (iter.block.height %% 1000 == 0) { + cat(iter.block.height, base::date(), "\n") + } + + block.hash <- rbch::getblockhash(blockchain.config, iter.block.height) + + if ( ! is.dogecoin) { + block.data <- rbch::getblock(blockchain.config, blockhash = block.hash@result, verbosity = "l1") + } else { + block.data <- getblock.doge(blockchain.config, blockhash = block.hash@result, verbosity = TRUE) + } + data.frame(block_height = iter.block.height, block_time = block.data@result$time) +}) + +block.times <- data.table::rbindlist(block.times) + +saveRDS(block.times, file = paste0(data.dir, "block_times.rds")) + + +