Data Preparation

This document will go through how we prepare the data that will be used for the Shiny application.

1 Setup

1.1 Loading Packages

We will use the following packages to prepare the data.

jsonlite - To parse JSON
knitr - For better table displays
tidyverse - Data science tools
tidygraph - For graph manipulations
igraph - Contains functions for network analysis

pacman::p_load(jsonlite, knitr, tidyverse, tidygraph, igraph)

1.2 Loading Data

We will load the provided VAST Mini-Challenge 3 dataset, a json file.

mc3_data <- fromJSON("data/mc3.json")
glimpse(mc3_data)

List of 5
 $ directed  : logi TRUE
 $ multigraph: logi TRUE
 $ graph     : Named list()
 $ nodes     :'data.frame': 60520 obs. of  15 variables:
  ..$ type             : chr [1:60520] "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" ...
  ..$ country          : chr [1:60520] "Uziland" "Mawalara" "Uzifrica" "Islavaragon" ...
  ..$ ProductServices  : chr [1:60520] "Unknown" "Furniture and home accessories" "Food products" "Unknown" ...
  ..$ PointOfContact   : chr [1:60520] "Rebecca Lewis" "Michael Lopez" "Steven Robertson" "Anthony Wyatt" ...
  ..$ HeadOfOrg        : chr [1:60520] "Émilie-Susan Benoit" "Honoré Lemoine" "Jules Labbé" "Dr. Víctor Hurtado" ...
  ..$ founding_date    : chr [1:60520] "1954-04-24T00:00:00" "2009-06-12T00:00:00" "2029-12-15T00:00:00" "1972-02-16T00:00:00" ...
  ..$ revenue          : num [1:60520] 5995 71767 0 0 4747 ...
  ..$ TradeDescription : chr [1:60520] "Unknown" "Abbott-Gomez is a leading manufacturer and supplier of high-quality furniture and home accessories, catering to"| __truncated__ "Abbott-Harrison is a leading manufacturer of high-quality food products, including baked goods, snacks, and bev"| __truncated__ "Unknown" ...
  ..$ _last_edited_by  : chr [1:60520] "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
  ..$ _last_edited_date: chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _date_added      : chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _raw_source      : chr [1:60520] "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
  ..$ _algorithm       : chr [1:60520] "Automatic Import" "Automatic Import" "Automatic Import" "Automatic Import" ...
  ..$ id               : chr [1:60520] "Abbott, Mcbride and Edwards" "Abbott-Gomez" "Abbott-Harrison" "Abbott-Ibarra" ...
  ..$ dob              : chr [1:60520] NA NA NA NA ...
 $ links     :'data.frame': 75817 obs. of  11 variables:
  ..$ start_date       : chr [1:75817] "2016-10-29T00:00:00" "2035-06-03T00:00:00" "2028-11-20T00:00:00" "2024-09-04T00:00:00" ...
  ..$ type             : chr [1:75817] "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" ...
  ..$ _last_edited_by  : chr [1:75817] "Pelagia Alethea Mordoch" "Niklaus Oberon" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
  ..$ _last_edited_date: chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _date_added      : chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _raw_source      : chr [1:75817] "Existing Corporate Structure Data" "Oceanus Corporations Monthly - Jun '35" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
  ..$ _algorithm       : chr [1:75817] "Automatic Import" "Manual Entry" "Automatic Import" "Automatic Import" ...
  ..$ source           : chr [1:75817] "Avery Inc" "Berger-Hayes" "Bowers Group" "Bowman-Howe" ...
  ..$ target           : chr [1:75817] "Allen, Nichols and Thompson" "Jensen, Morris and Downs" "Barnett Inc" "Bennett Ltd" ...
  ..$ key              : int [1:75817] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ end_date         : chr [1:75817] NA NA NA NA ...

There are 60,520 nodes and 75,817 edges in the data.

2 Graph Elements

2.1 Extracting nodes and edges

We will first extract the nodes and edges.

Nodes
Edges

mc3_nodes_raw <- as_tibble(mc3_data$nodes)
glimpse(mc3_nodes_raw)

Rows: 60,520
Columns: 15
$ type                <chr> "Entity.Organization.Company", "Entity.Organizatio…
$ country             <chr> "Uziland", "Mawalara", "Uzifrica", "Islavaragon", …
$ ProductServices     <chr> "Unknown", "Furniture and home accessories", "Food…
$ PointOfContact      <chr> "Rebecca Lewis", "Michael Lopez", "Steven Robertso…
$ HeadOfOrg           <chr> "Émilie-Susan Benoit", "Honoré Lemoine", "Jules La…
$ founding_date       <chr> "1954-04-24T00:00:00", "2009-06-12T00:00:00", "202…
$ revenue             <dbl> 5994.73, 71766.67, 0.00, 0.00, 4746.67, 46566.67, …
$ TradeDescription    <chr> "Unknown", "Abbott-Gomez is a leading manufacturer…
$ `_last_edited_by`   <chr> "Pelagia Alethea Mordoch", "Pelagia Alethea Mordoc…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_date_added`       <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_raw_source`       <chr> "Existing Corporate Structure Data", "Existing Cor…
$ `_algorithm`        <chr> "Automatic Import", "Automatic Import", "Automatic…
$ id                  <chr> "Abbott, Mcbride and Edwards", "Abbott-Gomez", "Ab…
$ dob                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

We will only retain the following columns:

id: to serve as the identifier for the node
type: to differentiate people from companies in the graph.
ProductServices: to identify the products of services a business offer

mc3_nodes_lite <- mc3_nodes_raw %>%
  select(id, type, ProductServices) %>%
  rename(product_services = ProductServices)

mc3_edges_raw <- as_tibble(mc3_data$links)
glimpse(mc3_edges_raw)

Rows: 75,817
Columns: 11
$ start_date          <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "202…
$ type                <chr> "Event.Owns.Shareholdership", "Event.Owns.Sharehol…
$ `_last_edited_by`   <chr> "Pelagia Alethea Mordoch", "Niklaus Oberon", "Pela…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_date_added`       <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_raw_source`       <chr> "Existing Corporate Structure Data", "Oceanus Corp…
$ `_algorithm`        <chr> "Automatic Import", "Manual Entry", "Automatic Imp…
$ source              <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowm…
$ target              <chr> "Allen, Nichols and Thompson", "Jensen, Morris and…
$ key                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ end_date            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

We will only retain the following columns:

source: to identify the actor of the relationship, corresponds to id in nodes.
target: to identify the receiver of the relationship, corresponds to id in nodes.
type: to identify the type of the relationship
start_date: to identify when the relationship started
end_date: to identify when the relationship ended

mc3_edges_lite <- mc3_edges_raw %>% select(source, target, type, start_date, end_date)

2.2 Deeper look at type

Both the nodes and edges have type which contains the type of the nodes and edges. We will assign a supertype and a subtype from type.

Nodes
Edges

mc3_nodes_lite %>%
  group_by(type) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

type	count
Entity.Person	50356
Entity.Organization.Company	7927
Entity.Person.CEO	1293
Entity.Organization.FishingCompany	600
Entity.Organization.LogisticsCompany	311
Entity.Organization.FinancialCompany	23
Entity.Organization.NGO	5
Entity.Organization.NewsCompany	5

supertype - type of entity, either Person or Organization

subtype - subcategory of supertype, e.g., Company, FishingCompany, CEO

mc3_edges_lite %>%
  group_by(type) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

type	count
Event.Owns.Shareholdership	39378
Event.Owns.BeneficialOwnership	21531
Event.WorksFor	14817
Relationship.FamilyRelationship	91

supertype - type of relationship, either Ownership, Employment, Relationship.

subtype - subcategory of supertype, e.g., Shareholdership, BeneficialOwnership, FamilyRelationship

2.3 Dates

Consider the date fields, e.g. start_date.

mc3_edges_lite %>% select(start_date) %>% glimpse()

Rows: 75,817
Columns: 1
$ start_date <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "2028-11-20T0…

It is using the ISO 8601 format that includes the time component. We are only interested in the date component so we will just get the first 10 characters.

substr("2016-10-29T00:00:00", 1, 10)

[1] "2016-10-29"

2.4 Node Aliases

As the nodes have long names, using them as labels in the visualization is not the best way as the text will cover important information.

We will generate them from the first character of each word. We will define a function to provide this capability.

to_initials <- function(name) {
  strsplit(name, "[^A-Za-z0-9']+")[[1]] %>%  # Split when non-alphanumeric
    substr(1, 1) %>% # Get first letter
    paste0(collapse = "") %>%
    substr(1, 4) # Get first 4 letters only as some names are still too long
}

to_initials("SouthSeafood Express Corp")

[1] "SEC"

We will add an included column to the nodes and edges for filtering purposes so that we can show or hide them depending on the filtering criteria.

This is particularly useful in network visualization.

We must also rename the columns for compatibility with igraph and tidygraph.

For edges, we will rename source and target to from and to respectively.

For nodes, we will rename id to name.

3 Preparing the Nodes

3.1 Shaping the data

We will now prepare the nodes according to the above considerations.

mc3_nodes_clean <- mc3_nodes_lite %>%
  rename(name = id) %>%
  mutate(
    alias = sapply(name, to_initials),
    supertype = strsplit(type, ".", fixed=TRUE) %>% sapply('[', 2),
    # Get the last type as subtype. In the case of Entity.Person,
    # both supertype and subtype are "Person".
    subtype = strsplit(type, ".", fixed=TRUE) %>% sapply(tail, n=1),
    included = 1
  ) %>% select(name, alias, supertype, subtype, product_services, included)

3.2 Checking the type fields

Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.

mc3_nodes_clean %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

supertype	subtype	count
Person	Person	50356
Organization	Company	7927
Person	CEO	1293
Organization	FishingCompany	600
Organization	LogisticsCompany	311
Organization	FinancialCompany	23
Organization	NGO	5
Organization	NewsCompany	5

3.3 Checking the rest of the data

Let’s also inspect the rest of the data if they are in the form we need.

mc3_nodes_clean %>% head() %>% kable()

name	alias	supertype	subtype	product_services	included
Abbott, Mcbride and Edwards	AMaE	Organization	Company	Unknown	1
Abbott-Gomez	AG	Organization	Company	Furniture and home accessories	1
Abbott-Harrison	AH	Organization	Company	Food products	1
Abbott-Ibarra	AI	Organization	Company	Unknown	1
Abbott-Sullivan	AS	Organization	Company	Unknown	1
Acevedo and Sons	AaS	Organization	Company	Fish, crustaceans and molluscs	1

The alias was successfully generated based on the node name. The dataframe also has all the columns we need.

4 Preparing the Edges

4.1 Shaping the data

With the previous considerations, we will shape the edge data.

mc3_edges_clean <- mc3_edges_lite %>%
  rename(from = source, to = target, ) %>%
  mutate(
    supertype = ifelse(
      grepl("Event.Owns", type),
      "Ownership",
      ifelse(grepl("Relationship", type), "Relationship", "Employment")
    ),
    subtype = strsplit(type, ".", fixed = TRUE) %>% sapply(tail, n = 1),
    # Convert date strings to datetime
    start_date = substr(start_date, 1, 10) %>% as_date(),
    end_date = substr(end_date, 1, 10) %>% as_date(),
    included = 1
  ) %>%
  filter(from != to) %>%
  group_by(from, to, supertype, subtype, start_date, end_date, included) %>%
  summarize(weight = n())

4.2 Checking the type fields

Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.

mc3_edges_clean %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

supertype	subtype	count
Ownership	Shareholdership	39378
Ownership	BeneficialOwnership	21529
Employment	WorksFor	14817
Relationship	FamilyRelationship	91

4.3 Checking the rest of the data

Let’s also inspect the rest of the data if they are in the form we need.

mc3_edges_clean %>% head() %>% kable()

from	to	supertype	subtype	start_date	end_date	included	weight
4. SeaCargo Ges.m.b.H.	Dry CreekRybachit Marine A/S	Ownership	Shareholdership	2034-12-31	NA	1	1
4. SeaCargo Ges.m.b.H.	KambalaSea Freight Inc	Ownership	Shareholdership	2033-04-12	NA	1	1
9. RiverLine CJSC	SumacAmerica Transport GmbH & Co. KG	Ownership	Shareholdership	2028-12-02	NA	1	1
Aaron Acosta	Manning-Pratt	Employment	WorksFor	2008-07-30	NA	1	1
Aaron Acosta	Manning-Pratt	Ownership	Shareholdership	2008-09-14	NA	1	1
Aaron Allen	Hicks-Calderon	Ownership	BeneficialOwnership	2025-03-06	NA	1	1

The dates columns only have the date components, not the time. The dataframe also has all the columns we need.

5 Preparing the Networks

5.1 Supernetwork

We will generate supernetwork containing all the nodes and edges we prepared.

supernetwork = tbl_graph(
  edges = mc3_edges_clean,
  nodes = mc3_nodes_clean,
  directed = TRUE
)

Let’s check the size of this network.

Node Count
Edge Count

vcount(supernetwork)

[1] 60520

ecount(supernetwork)

[1] 75815

We will save this as an RDS file for use in the Shiny app.

write_rds(supernetwork, "data/rds/supernetwork.rds")

5.2 Filter by subnetwork

The supernetwork is very large and not suitable for visualization as it requires a lot of computing resources to visualize.

Hence, we will create a function that will enable us to focus on the network of a given node. We will define a function extract_subnetwork for this.

extract_subnetwork <- function(graph, node_name, distance=-1) {
  # negative distance will show full graph
  node <- which(V(graph)$name == node_name)
  
  if(length(node) == 0) {
    # Return empty graph
    return(tbl_graph())
  }
  
  distance <- ifelse(distance < 0, length(graph), distance)
  vertices <- ego(graph, nodes = node, order = distance)[[1]]
  igraph_subgraph <- induced_subgraph(graph, vids = vertices)
  nodes_df <- as_data_frame(igraph_subgraph, what = "vertices")
  edges_sf <- as_data_frame(igraph_subgraph, what = "edges")
  tbl_graph(nodes=nodes_df, edges=edges_sf, directed=is_directed(graph))
}

About the function

This function generates a subnetwork from a graph based on the nodes in proximity to reference node.

The size of the network can be controlled by the distance of the other nodes from the reference node. If the distance is negative, it will include all nodes connected in any way to the reference node.

It uses ego from igraph to figure out which nodes are connected within a given distance from a node.

5.3 Filter by date

Next, to enable inspecting temporal patterns, we will filter the edges and nodes based on their existence on the given date.

We will define extract_network_snapshot to enable this filtering.

extract_network_snapshot <- function(graph, datestring, delete = FALSE) {
  date <- as_date(datestring)
  
  graph_nodes = as_data_frame(graph, what = "vertices")
  graph_edges = as_data_frame(graph, what = "edges")
  
  if(is.na(date) || vcount(graph) == 0) {
    return(graph)
  }
  
  # Assume transition is at 12 AM of given date
  graph_edges <- graph_edges %>%
    mutate(
      included = ifelse(is.na(start_date) | (
        start_date <= date &
          (is.na(end_date) |
             end_date > date)
      ), 1, 0)
    )
  
  filtered_edges <- graph_edges %>% filter(included == 1)
    
  graph_nodes <- graph_nodes %>%
    mutate(included = (
      name %in% filtered_edges$from | name %in% filtered_edges$to
    ))
  
  if(!delete) {
    return(
      tbl_graph(nodes = graph_nodes,
                edges = graph_edges,
                directed = is_directed(graph))
    )
  }
  
  tbl_graph(nodes = graph_nodes %>% filter(included == 1),
            edges = filtered_edges,
            directed = is_directed(graph))
}

About the function

This function sets included to true if the edge is active during the given date. For the nodes, they are considered active if they are connected to at least one other node at that point in time.

There is also an option to delete the inactive elements altogether, which is useful to calculate measures of centrality on the network structure at that point in time.

6 Preparing the power graph

To figure out who are the most influential entities within the network, we have to look at the power dynamics in each relationship. This entails generating a graph from the relationship graph that reflects these dynamics.

6.1 Arrow direction

As we want to investigate the flow of power and resources, the direction will be from the less powerful to more powerful entity.

6.2 Weights

For the purpose of our analysis, we will score each relationship flow according to the following:

Benefits from the resources: 1 (+1 if they are the owner of a company)
A decision-maker on where resources go: 2 (+1 if they are the ultimate decision maker)
Influential on the other person: 1 (only if target is a person)

We will use the total as the weights for each edge. Lastly, we will rename the relationship to prevent confusion in case of direction reversal.

6.3 Score table

Beneficial owners benefit from the resources of the company.

While BeneficialOwners may or may not be Shareholders in the same company, they hold the ultimate decision-making power.

Beneficial Ownership
Current From	Owner
Current To	Company
New From (Lower power)	Company
New To (Higher power)	Owner
New subtype	OwnedBy
Beneficiary	1 + 1
Decision-maker	2 + 1
Influential (on person)	0
Total	5

Shareholders may receive proceeds from the profits of the company.

They have some decision-making power within the company. However, it is the Beneficial Owner that ultimately gets the final say in decisions.

Shareholdership
Current From	Shareholder
Current To	Company
New From (Lower power)	Company
New To (Higher power)	Shareholder
New subtype	HasShareholder
Beneficiary	1
Decision-maker	2
Influential (on person)	0
Total	3

Employees are contractually obligated to act in the interests of their employer.

Employers also influence what their employees do in the business.

WorksFor
Current From	Owner
Current To	Company
New From (Lower power)	Company
New To (Higher power)	Owner
New subtype	OwnedBy
Beneficiary
Decision-maker	2
Influential (on person)	1
Total	3

As it is impossible to figure out the power dynamics within a family, we will retain direction and assume that the arrow points to the person that benefits more from the relationship.

FamilyRelationship
Current From	Person 1
Current To	Person 2
New From (Lower power)	Person 1
New To (Higher power)	Person 2
New subtype	FamilyRelationship
Beneficiary	1
Decision-maker	0
Influential (on person)	1
Total	2

Model Revision

From the results of visualization in the original Take-home Exercise 3, the model for the power graph resulted in the influence FamilyRelationship being too strong that it overrode all power held from business activities.

We revised this to be unidirectional to lessen the impact and we cannot assume that family members have the same sharing dynamic.

6.4 Converting relationship edges to power flow

Using the table above, we will create a function to convert relationship edges to power flow. We will define a function convert_edges_to_power_flow to take care of converting edges to power graph

Show the code for convert_edges_to_power_flow()

convert_edges_to_power_flow <- function(edges) {
  # Employee -> Employer, weight: 2
  works_for <- edges %>% filter(subtype == "WorksFor") %>% mutate(weight = 3)
  
  # Person1 <-> Person2, weight: 1
  family <- edges %>% filter(subtype == "FamilyRelationship") %>% mutate(weight = 2)
  
  # Shareholder <- Company, weight: 2
  shareholder <- edges %>% filter(subtype == "Shareholdership") %>%
    mutate(
      temp = from,
      from = to,
      to = temp,
      weight = 3,
      # Rename to prevent confusion due to reversed arrows
      subtype = "HasShareholder",
    ) %>%
    select(from, to, supertype, subtype, start_date, end_date, weight, included)
  
  # BeneficialOwner <- Company, weight: 3
  owner <- edges %>% filter(subtype == "BeneficialOwnership") %>%
    mutate(
      temp = from,
      from = to,
      to = temp,
      weight = 5,
      # Rename to prevent confusion due to reversed arrows
      subtype = "OwnedBy",
    ) %>%
    select(from, to, supertype, subtype, start_date, end_date, weight, included)
  
  works_for %>%
    rbind(family) %>%
    rbind(shareholder) %>%
    rbind(owner)
}

Let us check the resulting edges table to see if each edge type has been mapped correctly.

Relationship
Power

supernetwork %>%
  as_data_frame(what = "edges") %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

supertype	subtype	count
Ownership	Shareholdership	39378
Ownership	BeneficialOwnership	21529
Employment	WorksFor	14817
Relationship	FamilyRelationship	91

supernetwork %>%
  as_data_frame(what = "edges") %>%
  convert_edges_to_power_flow() %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()

supertype	subtype	count
Ownership	HasShareholder	39378
Ownership	OwnedBy	21529
Employment	WorksFor	14817
Relationship	FamilyRelationship	91