pacman::p_load(jsonlite, knitr, tidyverse, tidygraph, igraph)Data Preparation
This document will go through how we prepare the data that will be used for the Shiny application.
1 Setup
1.1 Loading Packages
We will use the following packages to prepare the data.
jsonlite - To parse JSON
knitr - For better table displays
tidyverse - Data science tools
tidygraph - For graph manipulations
igraph - Contains functions for network analysis
1.2 Loading Data
We will load the provided VAST Mini-Challenge 3 dataset, a json file.
mc3_data <- fromJSON("data/mc3.json")
glimpse(mc3_data)List of 5
$ directed : logi TRUE
$ multigraph: logi TRUE
$ graph : Named list()
$ nodes :'data.frame': 60520 obs. of 15 variables:
..$ type : chr [1:60520] "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" ...
..$ country : chr [1:60520] "Uziland" "Mawalara" "Uzifrica" "Islavaragon" ...
..$ ProductServices : chr [1:60520] "Unknown" "Furniture and home accessories" "Food products" "Unknown" ...
..$ PointOfContact : chr [1:60520] "Rebecca Lewis" "Michael Lopez" "Steven Robertson" "Anthony Wyatt" ...
..$ HeadOfOrg : chr [1:60520] "Émilie-Susan Benoit" "Honoré Lemoine" "Jules Labbé" "Dr. Víctor Hurtado" ...
..$ founding_date : chr [1:60520] "1954-04-24T00:00:00" "2009-06-12T00:00:00" "2029-12-15T00:00:00" "1972-02-16T00:00:00" ...
..$ revenue : num [1:60520] 5995 71767 0 0 4747 ...
..$ TradeDescription : chr [1:60520] "Unknown" "Abbott-Gomez is a leading manufacturer and supplier of high-quality furniture and home accessories, catering to"| __truncated__ "Abbott-Harrison is a leading manufacturer of high-quality food products, including baked goods, snacks, and bev"| __truncated__ "Unknown" ...
..$ _last_edited_by : chr [1:60520] "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
..$ _last_edited_date: chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
..$ _date_added : chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
..$ _raw_source : chr [1:60520] "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
..$ _algorithm : chr [1:60520] "Automatic Import" "Automatic Import" "Automatic Import" "Automatic Import" ...
..$ id : chr [1:60520] "Abbott, Mcbride and Edwards" "Abbott-Gomez" "Abbott-Harrison" "Abbott-Ibarra" ...
..$ dob : chr [1:60520] NA NA NA NA ...
$ links :'data.frame': 75817 obs. of 11 variables:
..$ start_date : chr [1:75817] "2016-10-29T00:00:00" "2035-06-03T00:00:00" "2028-11-20T00:00:00" "2024-09-04T00:00:00" ...
..$ type : chr [1:75817] "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" ...
..$ _last_edited_by : chr [1:75817] "Pelagia Alethea Mordoch" "Niklaus Oberon" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
..$ _last_edited_date: chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
..$ _date_added : chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
..$ _raw_source : chr [1:75817] "Existing Corporate Structure Data" "Oceanus Corporations Monthly - Jun '35" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
..$ _algorithm : chr [1:75817] "Automatic Import" "Manual Entry" "Automatic Import" "Automatic Import" ...
..$ source : chr [1:75817] "Avery Inc" "Berger-Hayes" "Bowers Group" "Bowman-Howe" ...
..$ target : chr [1:75817] "Allen, Nichols and Thompson" "Jensen, Morris and Downs" "Barnett Inc" "Bennett Ltd" ...
..$ key : int [1:75817] 0 0 0 0 0 0 0 0 0 0 ...
..$ end_date : chr [1:75817] NA NA NA NA ...
There are 60,520 nodes and 75,817 edges in the data.
2 Graph Elements
2.1 Extracting nodes and edges
We will first extract the nodes and edges.
mc3_nodes_raw <- as_tibble(mc3_data$nodes)
glimpse(mc3_nodes_raw)Rows: 60,520
Columns: 15
$ type <chr> "Entity.Organization.Company", "Entity.Organizatio…
$ country <chr> "Uziland", "Mawalara", "Uzifrica", "Islavaragon", …
$ ProductServices <chr> "Unknown", "Furniture and home accessories", "Food…
$ PointOfContact <chr> "Rebecca Lewis", "Michael Lopez", "Steven Robertso…
$ HeadOfOrg <chr> "Émilie-Susan Benoit", "Honoré Lemoine", "Jules La…
$ founding_date <chr> "1954-04-24T00:00:00", "2009-06-12T00:00:00", "202…
$ revenue <dbl> 5994.73, 71766.67, 0.00, 0.00, 4746.67, 46566.67, …
$ TradeDescription <chr> "Unknown", "Abbott-Gomez is a leading manufacturer…
$ `_last_edited_by` <chr> "Pelagia Alethea Mordoch", "Pelagia Alethea Mordoc…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_date_added` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_raw_source` <chr> "Existing Corporate Structure Data", "Existing Cor…
$ `_algorithm` <chr> "Automatic Import", "Automatic Import", "Automatic…
$ id <chr> "Abbott, Mcbride and Edwards", "Abbott-Gomez", "Ab…
$ dob <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
We will only retain the following columns:
- id: to serve as the identifier for the node
- type: to differentiate people from companies in the graph.
- ProductServices: to identify the products of services a business offer
mc3_nodes_lite <- mc3_nodes_raw %>%
select(id, type, ProductServices) %>%
rename(product_services = ProductServices)mc3_edges_raw <- as_tibble(mc3_data$links)
glimpse(mc3_edges_raw)Rows: 75,817
Columns: 11
$ start_date <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "202…
$ type <chr> "Event.Owns.Shareholdership", "Event.Owns.Sharehol…
$ `_last_edited_by` <chr> "Pelagia Alethea Mordoch", "Niklaus Oberon", "Pela…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_date_added` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_raw_source` <chr> "Existing Corporate Structure Data", "Oceanus Corp…
$ `_algorithm` <chr> "Automatic Import", "Manual Entry", "Automatic Imp…
$ source <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowm…
$ target <chr> "Allen, Nichols and Thompson", "Jensen, Morris and…
$ key <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ end_date <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
We will only retain the following columns:
- source: to identify the actor of the relationship, corresponds to id in nodes.
- target: to identify the receiver of the relationship, corresponds to id in nodes.
- type: to identify the type of the relationship
- start_date: to identify when the relationship started
- end_date: to identify when the relationship ended
mc3_edges_lite <- mc3_edges_raw %>% select(source, target, type, start_date, end_date)2.2 Deeper look at type
Both the nodes and edges have type which contains the type of the nodes and edges. We will assign a supertype and a subtype from type.
mc3_nodes_lite %>%
group_by(type) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| type | count |
|---|---|
| Entity.Person | 50356 |
| Entity.Organization.Company | 7927 |
| Entity.Person.CEO | 1293 |
| Entity.Organization.FishingCompany | 600 |
| Entity.Organization.LogisticsCompany | 311 |
| Entity.Organization.FinancialCompany | 23 |
| Entity.Organization.NGO | 5 |
| Entity.Organization.NewsCompany | 5 |
supertype - type of entity, either Person or Organization
subtype - subcategory of supertype, e.g., Company, FishingCompany, CEO
mc3_edges_lite %>%
group_by(type) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| type | count |
|---|---|
| Event.Owns.Shareholdership | 39378 |
| Event.Owns.BeneficialOwnership | 21531 |
| Event.WorksFor | 14817 |
| Relationship.FamilyRelationship | 91 |
supertype - type of relationship, either Ownership, Employment, Relationship.
subtype - subcategory of supertype, e.g., Shareholdership, BeneficialOwnership, FamilyRelationship
2.3 Dates
Consider the date fields, e.g. start_date.
mc3_edges_lite %>% select(start_date) %>% glimpse()Rows: 75,817
Columns: 1
$ start_date <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "2028-11-20T0…
It is using the ISO 8601 format that includes the time component. We are only interested in the date component so we will just get the first 10 characters.
substr("2016-10-29T00:00:00", 1, 10)[1] "2016-10-29"
2.4 Node Aliases
As the nodes have long names, using them as labels in the visualization is not the best way as the text will cover important information.
We will generate them from the first character of each word. We will define a function to provide this capability.
to_initials <- function(name) {
strsplit(name, "[^A-Za-z0-9']+")[[1]] %>% # Split when non-alphanumeric
substr(1, 1) %>% # Get first letter
paste0(collapse = "") %>%
substr(1, 4) # Get first 4 letters only as some names are still too long
}to_initials("SouthSeafood Express Corp")[1] "SEC"
2.5 Other considerations
We will add an included column to the nodes and edges for filtering purposes so that we can show or hide them depending on the filtering criteria.
This is particularly useful in network visualization.
We must also rename the columns for compatibility with igraph and tidygraph.
For edges, we will rename source and target to from and to respectively.
For nodes, we will rename id to name.
3 Preparing the Nodes
3.1 Shaping the data
We will now prepare the nodes according to the above considerations.
mc3_nodes_clean <- mc3_nodes_lite %>%
rename(name = id) %>%
mutate(
alias = sapply(name, to_initials),
supertype = strsplit(type, ".", fixed=TRUE) %>% sapply('[', 2),
# Get the last type as subtype. In the case of Entity.Person,
# both supertype and subtype are "Person".
subtype = strsplit(type, ".", fixed=TRUE) %>% sapply(tail, n=1),
included = 1
) %>% select(name, alias, supertype, subtype, product_services, included)3.2 Checking the type fields
Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.
mc3_nodes_clean %>%
group_by(supertype, subtype) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| supertype | subtype | count |
|---|---|---|
| Person | Person | 50356 |
| Organization | Company | 7927 |
| Person | CEO | 1293 |
| Organization | FishingCompany | 600 |
| Organization | LogisticsCompany | 311 |
| Organization | FinancialCompany | 23 |
| Organization | NGO | 5 |
| Organization | NewsCompany | 5 |
3.3 Checking the rest of the data
Let’s also inspect the rest of the data if they are in the form we need.
mc3_nodes_clean %>% head() %>% kable()| name | alias | supertype | subtype | product_services | included |
|---|---|---|---|---|---|
| Abbott, Mcbride and Edwards | AMaE | Organization | Company | Unknown | 1 |
| Abbott-Gomez | AG | Organization | Company | Furniture and home accessories | 1 |
| Abbott-Harrison | AH | Organization | Company | Food products | 1 |
| Abbott-Ibarra | AI | Organization | Company | Unknown | 1 |
| Abbott-Sullivan | AS | Organization | Company | Unknown | 1 |
| Acevedo and Sons | AaS | Organization | Company | Fish, crustaceans and molluscs | 1 |
The alias was successfully generated based on the node name. The dataframe also has all the columns we need.
4 Preparing the Edges
4.1 Shaping the data
With the previous considerations, we will shape the edge data.
mc3_edges_clean <- mc3_edges_lite %>%
rename(from = source, to = target, ) %>%
mutate(
supertype = ifelse(
grepl("Event.Owns", type),
"Ownership",
ifelse(grepl("Relationship", type), "Relationship", "Employment")
),
subtype = strsplit(type, ".", fixed = TRUE) %>% sapply(tail, n = 1),
# Convert date strings to datetime
start_date = substr(start_date, 1, 10) %>% as_date(),
end_date = substr(end_date, 1, 10) %>% as_date(),
included = 1
) %>%
filter(from != to) %>%
group_by(from, to, supertype, subtype, start_date, end_date, included) %>%
summarize(weight = n())4.2 Checking the type fields
Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.
mc3_edges_clean %>%
group_by(supertype, subtype) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| supertype | subtype | count |
|---|---|---|
| Ownership | Shareholdership | 39378 |
| Ownership | BeneficialOwnership | 21529 |
| Employment | WorksFor | 14817 |
| Relationship | FamilyRelationship | 91 |
4.3 Checking the rest of the data
Let’s also inspect the rest of the data if they are in the form we need.
mc3_edges_clean %>% head() %>% kable()| from | to | supertype | subtype | start_date | end_date | included | weight |
|---|---|---|---|---|---|---|---|
| 4. SeaCargo Ges.m.b.H. | Dry CreekRybachit Marine A/S | Ownership | Shareholdership | 2034-12-31 | NA | 1 | 1 |
| 4. SeaCargo Ges.m.b.H. | KambalaSea Freight Inc | Ownership | Shareholdership | 2033-04-12 | NA | 1 | 1 |
| 9. RiverLine CJSC | SumacAmerica Transport GmbH & Co. KG | Ownership | Shareholdership | 2028-12-02 | NA | 1 | 1 |
| Aaron Acosta | Manning-Pratt | Employment | WorksFor | 2008-07-30 | NA | 1 | 1 |
| Aaron Acosta | Manning-Pratt | Ownership | Shareholdership | 2008-09-14 | NA | 1 | 1 |
| Aaron Allen | Hicks-Calderon | Ownership | BeneficialOwnership | 2025-03-06 | NA | 1 | 1 |
The dates columns only have the date components, not the time. The dataframe also has all the columns we need.
5 Preparing the Networks
5.1 Supernetwork
We will generate supernetwork containing all the nodes and edges we prepared.
supernetwork = tbl_graph(
edges = mc3_edges_clean,
nodes = mc3_nodes_clean,
directed = TRUE
)Let’s check the size of this network.
vcount(supernetwork)[1] 60520
ecount(supernetwork)[1] 75815
We will save this as an RDS file for use in the Shiny app.
write_rds(supernetwork, "data/rds/supernetwork.rds")5.2 Filter by subnetwork
The supernetwork is very large and not suitable for visualization as it requires a lot of computing resources to visualize.
Hence, we will create a function that will enable us to focus on the network of a given node. We will define a function extract_subnetwork for this.
extract_subnetwork <- function(graph, node_name, distance=-1) {
# negative distance will show full graph
node <- which(V(graph)$name == node_name)
if(length(node) == 0) {
# Return empty graph
return(tbl_graph())
}
distance <- ifelse(distance < 0, length(graph), distance)
vertices <- ego(graph, nodes = node, order = distance)[[1]]
igraph_subgraph <- induced_subgraph(graph, vids = vertices)
nodes_df <- as_data_frame(igraph_subgraph, what = "vertices")
edges_sf <- as_data_frame(igraph_subgraph, what = "edges")
tbl_graph(nodes=nodes_df, edges=edges_sf, directed=is_directed(graph))
}This function generates a subnetwork from a graph based on the nodes in proximity to reference node.
The size of the network can be controlled by the distance of the other nodes from the reference node. If the distance is negative, it will include all nodes connected in any way to the reference node.
It uses ego from igraph to figure out which nodes are connected within a given distance from a node.
5.3 Filter by date
Next, to enable inspecting temporal patterns, we will filter the edges and nodes based on their existence on the given date.
We will define extract_network_snapshot to enable this filtering.
extract_network_snapshot <- function(graph, datestring, delete = FALSE) {
date <- as_date(datestring)
graph_nodes = as_data_frame(graph, what = "vertices")
graph_edges = as_data_frame(graph, what = "edges")
if(is.na(date) || vcount(graph) == 0) {
return(graph)
}
# Assume transition is at 12 AM of given date
graph_edges <- graph_edges %>%
mutate(
included = ifelse(is.na(start_date) | (
start_date <= date &
(is.na(end_date) |
end_date > date)
), 1, 0)
)
filtered_edges <- graph_edges %>% filter(included == 1)
graph_nodes <- graph_nodes %>%
mutate(included = (
name %in% filtered_edges$from | name %in% filtered_edges$to
))
if(!delete) {
return(
tbl_graph(nodes = graph_nodes,
edges = graph_edges,
directed = is_directed(graph))
)
}
tbl_graph(nodes = graph_nodes %>% filter(included == 1),
edges = filtered_edges,
directed = is_directed(graph))
}This function sets included to true if the edge is active during the given date. For the nodes, they are considered active if they are connected to at least one other node at that point in time.
There is also an option to delete the inactive elements altogether, which is useful to calculate measures of centrality on the network structure at that point in time.
6 Preparing the power graph
To figure out who are the most influential entities within the network, we have to look at the power dynamics in each relationship. This entails generating a graph from the relationship graph that reflects these dynamics.
6.1 Arrow direction
As we want to investigate the flow of power and resources, the direction will be from the less powerful to more powerful entity.
6.2 Weights
For the purpose of our analysis, we will score each relationship flow according to the following:
Benefits from the resources: 1 (+1 if they are the owner of a company)
A decision-maker on where resources go: 2 (+1 if they are the ultimate decision maker)
Influential on the other person: 1 (only if target is a person)
We will use the total as the weights for each edge. Lastly, we will rename the relationship to prevent confusion in case of direction reversal.
6.3 Score table
Beneficial owners benefit from the resources of the company.
While BeneficialOwners may or may not be Shareholders in the same company, they hold the ultimate decision-making power.
| Current From | Owner |
| Current To | Company |
| New From (Lower power) | Company |
| New To (Higher power) | Owner |
| New subtype | OwnedBy |
| Beneficiary | 1 + 1 |
| Decision-maker | 2 + 1 |
| Influential (on person) | 0 |
| Total | 5 |
Shareholders may receive proceeds from the profits of the company.
They have some decision-making power within the company. However, it is the Beneficial Owner that ultimately gets the final say in decisions.
| Current From | Shareholder |
| Current To | Company |
| New From (Lower power) | Company |
| New To (Higher power) | Shareholder |
| New subtype | HasShareholder |
| Beneficiary | 1 |
| Decision-maker | 2 |
| Influential (on person) | 0 |
| Total | 3 |
Employees are contractually obligated to act in the interests of their employer.
Employers also influence what their employees do in the business.
| Current From | Owner |
| Current To | Company |
| New From (Lower power) | Company |
| New To (Higher power) | Owner |
| New subtype | OwnedBy |
| Beneficiary | |
| Decision-maker | 2 |
| Influential (on person) | 1 |
| Total | 3 |
As it is impossible to figure out the power dynamics within a family, we will retain direction and assume that the arrow points to the person that benefits more from the relationship.
| Current From | Person 1 |
| Current To | Person 2 |
| New From (Lower power) | Person 1 |
| New To (Higher power) | Person 2 |
| New subtype | FamilyRelationship |
| Beneficiary | 1 |
| Decision-maker | 0 |
| Influential (on person) | 1 |
| Total | 2 |
From the results of visualization in the original Take-home Exercise 3, the model for the power graph resulted in the influence FamilyRelationship being too strong that it overrode all power held from business activities.
We revised this to be unidirectional to lessen the impact and we cannot assume that family members have the same sharing dynamic.
6.4 Converting relationship edges to power flow
Using the table above, we will create a function to convert relationship edges to power flow. We will define a function convert_edges_to_power_flow to take care of converting edges to power graph
Show the code for convert_edges_to_power_flow()
convert_edges_to_power_flow <- function(edges) {
# Employee -> Employer, weight: 2
works_for <- edges %>% filter(subtype == "WorksFor") %>% mutate(weight = 3)
# Person1 <-> Person2, weight: 1
family <- edges %>% filter(subtype == "FamilyRelationship") %>% mutate(weight = 2)
# Shareholder <- Company, weight: 2
shareholder <- edges %>% filter(subtype == "Shareholdership") %>%
mutate(
temp = from,
from = to,
to = temp,
weight = 3,
# Rename to prevent confusion due to reversed arrows
subtype = "HasShareholder",
) %>%
select(from, to, supertype, subtype, start_date, end_date, weight, included)
# BeneficialOwner <- Company, weight: 3
owner <- edges %>% filter(subtype == "BeneficialOwnership") %>%
mutate(
temp = from,
from = to,
to = temp,
weight = 5,
# Rename to prevent confusion due to reversed arrows
subtype = "OwnedBy",
) %>%
select(from, to, supertype, subtype, start_date, end_date, weight, included)
works_for %>%
rbind(family) %>%
rbind(shareholder) %>%
rbind(owner)
}Let us check the resulting edges table to see if each edge type has been mapped correctly.
supernetwork %>%
as_data_frame(what = "edges") %>%
group_by(supertype, subtype) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| supertype | subtype | count |
|---|---|---|
| Ownership | Shareholdership | 39378 |
| Ownership | BeneficialOwnership | 21529 |
| Employment | WorksFor | 14817 |
| Relationship | FamilyRelationship | 91 |
supernetwork %>%
as_data_frame(what = "edges") %>%
convert_edges_to_power_flow() %>%
group_by(supertype, subtype) %>%
summarize(count = n()) %>%
arrange(-count) %>%
kable()| supertype | subtype | count |
|---|---|---|
| Ownership | HasShareholder | 39378 |
| Ownership | OwnedBy | 21529 |
| Employment | WorksFor | 14817 |
| Relationship | FamilyRelationship | 91 |