**The three level Sankey Diagram**

In the last blog post, I wanted to give you a detailed theory of how the thing works. To keep the explanation clear, I gave you a tool that is somewhat limiting, but it is easier to follow along with than a 3 level diagram.

The way that you make a 3 level diagram is to have a left, middle, and right column of data. Then you make the connections between left and middle, then middle and right. Then you plot it. So, rather than tell you again how it works, I'm just going to give you the code so that you have it and can play with it. Start up RStudio, open a new file, copy the following into it. I will assume at this point you know how to make the audit.csv file.

library(plyr)

library(dplyr)

library(networkD3)

# This script will make a 3 level Sankey diagram
#
# It reads audit events from a CSV file, optionally subsets them, counts how
# often each left -> middle and middle -> right pair of values occurs, and
# draws those counts with networkD3::sankeyNetwork().

# Columns of the audit data to chart, from left to right
left_field <- "EVENT"
middle_field <- "OBJ_KIND"
right_field <- "RESULT"

# Subset the data based on this expression. Leave all three empty to chart
# everything. The pieces are pasted together and parsed as R code, so a string
# value needs its own quotes, e.g. operand1 <- "EVENT", operation <- "==",
# operand2 <- "'SOME_VALUE'" (placeholder value).
operand1 <- ""
operation <- ""
operand2 <- ""
expr <- paste(operand1, operation, operand2)

# Read in the data and don't let strings become factors
audit <- read.csv("~/R/audit-data/audit.csv", header = TRUE,
                  stringsAsFactors = FALSE)

# Fail early with a readable message if a requested column is missing
missing_fields <- setdiff(c(left_field, middle_field, right_field),
                          names(audit))
if (length(missing_fields) > 0) {
  stop("Column(s) not found in the audit data: ",
       paste(missing_fields, collapse = ", "), call. = FALSE)
}

# Every row counts as one event when the edges are summed below
audit$one <- rep(1, nrow(audit))

# Subset the audit information depending on the expression.
# paste() joins the three pieces with a space between each, so an unset filter
# is two spaces, not one; test for any non-blank text instead of comparing
# against a fixed string.
if (nzchar(trimws(expr))) {
  # filter_() is deprecated in dplyr; parse the string once and unquote the
  # resulting call into filter() so it is evaluated against the columns.
  audit <- dplyr::filter(audit, !!(parse(text = expr)[[1]]))
}

if (nrow(audit) == 0) {
  stop("No audit records left to chart", call. = FALSE)
}

# Make 2 dataframes for a 3 level Sankey
left <- data.frame(Source = audit[[left_field]],
                   Target = audit[[middle_field]],
                   Num = audit$one,
                   stringsAsFactors = FALSE)
right <- data.frame(Source = audit[[middle_field]],
                    Target = audit[[right_field]],
                    Num = audit$one,
                    stringsAsFactors = FALSE)

# Now summarize and collapse to unique values to calculate edges.
# The plyr calls are namespace-qualified so that dplyr's summarize(), which
# masks plyr's once dplyr is attached, cannot be picked up by mistake.
l <- plyr::ddply(left, plyr::.(Source, Target), plyr::summarize,
                 Value = sum(Num))
r <- plyr::ddply(right, plyr::.(Source, Target), plyr::summarize,
                 Value = sum(Num))

# Calculate Nodes lookup table. IDs start at 0 and follow row order.
# Note: nodes are keyed by label only, so the same string appearing in two
# different columns is drawn as a single node.
node_names <- unique(c(as.character(l$Source),
                       as.character(l$Target),
                       as.character(r$Target)))
nodes <- data.frame(ID = seq_along(node_names) - 1L,
                    name = node_names,
                    stringsAsFactors = FALSE)

# Combine into one big final data frame and map each label to its node ID.
# osrc/otgt keep the original labels next to the numeric source/target IDs.
all_pairs <- rbind(l, r)
edges <- data.frame(
  osrc = as.character(all_pairs$Source),
  otgt = as.character(all_pairs$Target),
  value = all_pairs$Value,
  source = match(as.character(all_pairs$Source), nodes$name) - 1L,
  target = match(as.character(all_pairs$Target), nodes$name) - 1L,
  stringsAsFactors = FALSE
)

sankeyNetwork(Links = edges, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              fontSize = 16, nodeWidth = 30,
              height = 2000, width = 1800)

library(dplyr)

library(networkD3)

# This script will make a 3 level Sankey diagram
#
# It reads audit events from a CSV file, optionally subsets them, counts how
# often each left -> middle and middle -> right pair of values occurs, and
# draws those counts with networkD3::sankeyNetwork().

# Columns of the audit data to chart, from left to right
left_field <- "EVENT"
middle_field <- "OBJ_KIND"
right_field <- "RESULT"

# Subset the data based on this expression. Leave all three empty to chart
# everything. The pieces are pasted together and parsed as R code, so a string
# value needs its own quotes, e.g. operand1 <- "EVENT", operation <- "==",
# operand2 <- "'SOME_VALUE'" (placeholder value).
operand1 <- ""
operation <- ""
operand2 <- ""
expr <- paste(operand1, operation, operand2)

# Read in the data and don't let strings become factors
audit <- read.csv("~/R/audit-data/audit.csv", header = TRUE,
                  stringsAsFactors = FALSE)

# Fail early with a readable message if a requested column is missing
missing_fields <- setdiff(c(left_field, middle_field, right_field),
                          names(audit))
if (length(missing_fields) > 0) {
  stop("Column(s) not found in the audit data: ",
       paste(missing_fields, collapse = ", "), call. = FALSE)
}

# Every row counts as one event when the edges are summed below
audit$one <- rep(1, nrow(audit))

# Subset the audit information depending on the expression.
# paste() joins the three pieces with a space between each, so an unset filter
# is two spaces, not one; test for any non-blank text instead of comparing
# against a fixed string.
if (nzchar(trimws(expr))) {
  # filter_() is deprecated in dplyr; parse the string once and unquote the
  # resulting call into filter() so it is evaluated against the columns.
  audit <- dplyr::filter(audit, !!(parse(text = expr)[[1]]))
}

if (nrow(audit) == 0) {
  stop("No audit records left to chart", call. = FALSE)
}

# Make 2 dataframes for a 3 level Sankey
left <- data.frame(Source = audit[[left_field]],
                   Target = audit[[middle_field]],
                   Num = audit$one,
                   stringsAsFactors = FALSE)
right <- data.frame(Source = audit[[middle_field]],
                    Target = audit[[right_field]],
                    Num = audit$one,
                    stringsAsFactors = FALSE)

# Now summarize and collapse to unique values to calculate edges.
# ddply() and .() come from plyr, which this script does not attach, so the
# calls are namespace-qualified; plyr must be installed but need not be loaded.
l <- plyr::ddply(left, plyr::.(Source, Target), plyr::summarize,
                 Value = sum(Num))
r <- plyr::ddply(right, plyr::.(Source, Target), plyr::summarize,
                 Value = sum(Num))

# Calculate Nodes lookup table. IDs start at 0 and follow row order.
# Note: nodes are keyed by label only, so the same string appearing in two
# different columns is drawn as a single node.
node_names <- unique(c(as.character(l$Source),
                       as.character(l$Target),
                       as.character(r$Target)))
nodes <- data.frame(ID = seq_along(node_names) - 1L,
                    name = node_names,
                    stringsAsFactors = FALSE)

# Combine into one big final data frame and map each label to its node ID.
# osrc/otgt keep the original labels next to the numeric source/target IDs.
all_pairs <- rbind(l, r)
edges <- data.frame(
  osrc = as.character(all_pairs$Source),
  otgt = as.character(all_pairs$Target),
  value = all_pairs$Value,
  source = match(as.character(all_pairs$Source), nodes$name) - 1L,
  target = match(as.character(all_pairs$Target), nodes$name) - 1L,
  stringsAsFactors = FALSE
)

sankeyNetwork(Links = edges, Nodes = nodes,
              Source = "source", Target = "target",
              Value = "value", NodeID = "name",
              fontSize = 16, nodeWidth = 30,
              height = 2000, width = 1800)

There are 3 variables, left_field, middle_field, and right_field, that define what will get charted. This program has them set to "EVENT", "OBJ_KIND", and "RESULT". (If you need a reminder of what fields are available and what they mean, see this blog post.) This shows you what kind of objects are associated with the event and what the final result is. This is what it looks like on my system:

I recommend you play around with it. I would also suggest trying these field combinations:

ACTION

RESULT

OBJ_KIND

HOW

OBJ_KIND

RESULT

SUBJ_PRIME

RESULT

OBJ_KIND

EVENT

ACTION

OBJ_KIND

You should play around with it. You might see something in your events you hadn't noticed before. One last tip: too much data makes it hard to see things. You might experiment with subsetting the information, either by adding command line switches to ausearch so that it narrows down what's collected, or by using R code.

## No comments:

Post a Comment