Data Preparation

This document will go through how we prepare the data that will be used for the Shiny application.

1 Setup

1.1 Loading Packages

We will use the following packages to prepare the data.

pacman::p_load(jsonlite, knitr, tidyverse, tidygraph, igraph)

1.2 Loading Data

We will load the provided VAST Mini-Challenge 3 dataset, a json file.

mc3_data <- fromJSON("data/mc3.json")
glimpse(mc3_data)
List of 5
 $ directed  : logi TRUE
 $ multigraph: logi TRUE
 $ graph     : Named list()
 $ nodes     :'data.frame': 60520 obs. of  15 variables:
  ..$ type             : chr [1:60520] "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" "Entity.Organization.Company" ...
  ..$ country          : chr [1:60520] "Uziland" "Mawalara" "Uzifrica" "Islavaragon" ...
  ..$ ProductServices  : chr [1:60520] "Unknown" "Furniture and home accessories" "Food products" "Unknown" ...
  ..$ PointOfContact   : chr [1:60520] "Rebecca Lewis" "Michael Lopez" "Steven Robertson" "Anthony Wyatt" ...
  ..$ HeadOfOrg        : chr [1:60520] "Émilie-Susan Benoit" "Honoré Lemoine" "Jules Labbé" "Dr. Víctor Hurtado" ...
  ..$ founding_date    : chr [1:60520] "1954-04-24T00:00:00" "2009-06-12T00:00:00" "2029-12-15T00:00:00" "1972-02-16T00:00:00" ...
  ..$ revenue          : num [1:60520] 5995 71767 0 0 4747 ...
  ..$ TradeDescription : chr [1:60520] "Unknown" "Abbott-Gomez is a leading manufacturer and supplier of high-quality furniture and home accessories, catering to"| __truncated__ "Abbott-Harrison is a leading manufacturer of high-quality food products, including baked goods, snacks, and bev"| __truncated__ "Unknown" ...
  ..$ _last_edited_by  : chr [1:60520] "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
  ..$ _last_edited_date: chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _date_added      : chr [1:60520] "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _raw_source      : chr [1:60520] "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
  ..$ _algorithm       : chr [1:60520] "Automatic Import" "Automatic Import" "Automatic Import" "Automatic Import" ...
  ..$ id               : chr [1:60520] "Abbott, Mcbride and Edwards" "Abbott-Gomez" "Abbott-Harrison" "Abbott-Ibarra" ...
  ..$ dob              : chr [1:60520] NA NA NA NA ...
 $ links     :'data.frame': 75817 obs. of  11 variables:
  ..$ start_date       : chr [1:75817] "2016-10-29T00:00:00" "2035-06-03T00:00:00" "2028-11-20T00:00:00" "2024-09-04T00:00:00" ...
  ..$ type             : chr [1:75817] "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" "Event.Owns.Shareholdership" ...
  ..$ _last_edited_by  : chr [1:75817] "Pelagia Alethea Mordoch" "Niklaus Oberon" "Pelagia Alethea Mordoch" "Pelagia Alethea Mordoch" ...
  ..$ _last_edited_date: chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _date_added      : chr [1:75817] "2035-01-01T00:00:00" "2035-07-15T00:00:00" "2035-01-01T00:00:00" "2035-01-01T00:00:00" ...
  ..$ _raw_source      : chr [1:75817] "Existing Corporate Structure Data" "Oceanus Corporations Monthly - Jun '35" "Existing Corporate Structure Data" "Existing Corporate Structure Data" ...
  ..$ _algorithm       : chr [1:75817] "Automatic Import" "Manual Entry" "Automatic Import" "Automatic Import" ...
  ..$ source           : chr [1:75817] "Avery Inc" "Berger-Hayes" "Bowers Group" "Bowman-Howe" ...
  ..$ target           : chr [1:75817] "Allen, Nichols and Thompson" "Jensen, Morris and Downs" "Barnett Inc" "Bennett Ltd" ...
  ..$ key              : int [1:75817] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ end_date         : chr [1:75817] NA NA NA NA ...

There are 60,520 nodes and 75,817 edges in the data.

2 Graph Elements

2.1 Extracting nodes and edges

We will first extract the nodes and edges.

mc3_nodes_raw <- as_tibble(mc3_data$nodes)
glimpse(mc3_nodes_raw)
Rows: 60,520
Columns: 15
$ type                <chr> "Entity.Organization.Company", "Entity.Organizatio…
$ country             <chr> "Uziland", "Mawalara", "Uzifrica", "Islavaragon", …
$ ProductServices     <chr> "Unknown", "Furniture and home accessories", "Food…
$ PointOfContact      <chr> "Rebecca Lewis", "Michael Lopez", "Steven Robertso…
$ HeadOfOrg           <chr> "Émilie-Susan Benoit", "Honoré Lemoine", "Jules La…
$ founding_date       <chr> "1954-04-24T00:00:00", "2009-06-12T00:00:00", "202…
$ revenue             <dbl> 5994.73, 71766.67, 0.00, 0.00, 4746.67, 46566.67, …
$ TradeDescription    <chr> "Unknown", "Abbott-Gomez is a leading manufacturer…
$ `_last_edited_by`   <chr> "Pelagia Alethea Mordoch", "Pelagia Alethea Mordoc…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_date_added`       <chr> "2035-01-01T00:00:00", "2035-01-01T00:00:00", "203…
$ `_raw_source`       <chr> "Existing Corporate Structure Data", "Existing Cor…
$ `_algorithm`        <chr> "Automatic Import", "Automatic Import", "Automatic…
$ id                  <chr> "Abbott, Mcbride and Edwards", "Abbott-Gomez", "Ab…
$ dob                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

We will only retain the following columns:

  • id: to serve as the identifier for the node
  • type: to differentiate people from companies in the graph.
  • ProductServices: to identify the products of services a business offer
mc3_nodes_lite <- mc3_nodes_raw %>%
  select(id, type, ProductServices) %>%
  rename(product_services = ProductServices)
mc3_edges_raw <- as_tibble(mc3_data$links)
glimpse(mc3_edges_raw)
Rows: 75,817
Columns: 11
$ start_date          <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "202…
$ type                <chr> "Event.Owns.Shareholdership", "Event.Owns.Sharehol…
$ `_last_edited_by`   <chr> "Pelagia Alethea Mordoch", "Niklaus Oberon", "Pela…
$ `_last_edited_date` <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_date_added`       <chr> "2035-01-01T00:00:00", "2035-07-15T00:00:00", "203…
$ `_raw_source`       <chr> "Existing Corporate Structure Data", "Oceanus Corp…
$ `_algorithm`        <chr> "Automatic Import", "Manual Entry", "Automatic Imp…
$ source              <chr> "Avery Inc", "Berger-Hayes", "Bowers Group", "Bowm…
$ target              <chr> "Allen, Nichols and Thompson", "Jensen, Morris and…
$ key                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ end_date            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

We will only retain the following columns:

  • source: to identify the actor of the relationship, corresponds to id in nodes.
  • target: to identify the receiver of the relationship, corresponds to id in nodes.
  • type: to identify the type of the relationship
  • start_date: to identify when the relationship started
  • end_date: to identify when the relationship ended
mc3_edges_lite <- mc3_edges_raw %>% select(source, target, type, start_date, end_date)

2.2 Deeper look at type

Both the nodes and edges have type which contains the type of the nodes and edges. We will assign a supertype and a subtype from type.

mc3_nodes_lite %>%
  group_by(type) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
type count
Entity.Person 50356
Entity.Organization.Company 7927
Entity.Person.CEO 1293
Entity.Organization.FishingCompany 600
Entity.Organization.LogisticsCompany 311
Entity.Organization.FinancialCompany 23
Entity.Organization.NGO 5
Entity.Organization.NewsCompany 5

supertype - type of entity, either Person or Organization

subtype - subcategory of supertype, e.g., Company, FishingCompany, CEO

mc3_edges_lite %>%
  group_by(type) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
type count
Event.Owns.Shareholdership 39378
Event.Owns.BeneficialOwnership 21531
Event.WorksFor 14817
Relationship.FamilyRelationship 91

supertype - type of relationship, either Ownership, Employment, Relationship.

subtype - subcategory of supertype, e.g., Shareholdership, BeneficialOwnership, FamilyRelationship

2.3 Dates

Consider the date fields, e.g. start_date.

mc3_edges_lite %>% select(start_date) %>% glimpse()
Rows: 75,817
Columns: 1
$ start_date <chr> "2016-10-29T00:00:00", "2035-06-03T00:00:00", "2028-11-20T0…

It is using the ISO 8601 format that includes the time component. We are only interested in the date component so we will just get the first 10 characters.

substr("2016-10-29T00:00:00", 1, 10)
[1] "2016-10-29"

2.4 Node Aliases

As the nodes have long names, using them as labels in the visualization is not the best way as the text will cover important information.

We will generate them from the first character of each word. We will define a function to provide this capability.

to_initials <- function(name) {
  strsplit(name, "[^A-Za-z0-9']+")[[1]] %>%  # Split when non-alphanumeric
    substr(1, 1) %>% # Get first letter
    paste0(collapse = "") %>%
    substr(1, 4) # Get first 4 letters only as some names are still too long
}
to_initials("SouthSeafood Express Corp")
[1] "SEC"

2.5 Other considerations

We will add an included column to the nodes and edges for filtering purposes so that we can show or hide them depending on the filtering criteria.

This is particularly useful in network visualization.

We must also rename the columns for compatibility with igraph and tidygraph.

For edges, we will rename source and target to from and to respectively.

For nodes, we will rename id to name.

3 Preparing the Nodes

3.1 Shaping the data

We will now prepare the nodes according to the above considerations.

mc3_nodes_clean <- mc3_nodes_lite %>%
  rename(name = id) %>%
  mutate(
    alias = sapply(name, to_initials),
    supertype = strsplit(type, ".", fixed=TRUE) %>% sapply('[', 2),
    # Get the last type as subtype. In the case of Entity.Person,
    # both supertype and subtype are "Person".
    subtype = strsplit(type, ".", fixed=TRUE) %>% sapply(tail, n=1),
    included = 1
  ) %>% select(name, alias, supertype, subtype, product_services, included)

3.2 Checking the type fields

Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.

mc3_nodes_clean %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
supertype subtype count
Person Person 50356
Organization Company 7927
Person CEO 1293
Organization FishingCompany 600
Organization LogisticsCompany 311
Organization FinancialCompany 23
Organization NGO 5
Organization NewsCompany 5

3.3 Checking the rest of the data

Let’s also inspect the rest of the data if they are in the form we need.

mc3_nodes_clean %>% head() %>% kable()
name alias supertype subtype product_services included
Abbott, Mcbride and Edwards AMaE Organization Company Unknown 1
Abbott-Gomez AG Organization Company Furniture and home accessories 1
Abbott-Harrison AH Organization Company Food products 1
Abbott-Ibarra AI Organization Company Unknown 1
Abbott-Sullivan AS Organization Company Unknown 1
Acevedo and Sons AaS Organization Company Fish, crustaceans and molluscs 1

The alias was successfully generated based on the node name. The dataframe also has all the columns we need.

4 Preparing the Edges

4.1 Shaping the data

With the previous considerations, we will shape the edge data.

mc3_edges_clean <- mc3_edges_lite %>%
  rename(from = source, to = target, ) %>%
  mutate(
    supertype = ifelse(
      grepl("Event.Owns", type),
      "Ownership",
      ifelse(grepl("Relationship", type), "Relationship", "Employment")
    ),
    subtype = strsplit(type, ".", fixed = TRUE) %>% sapply(tail, n = 1),
    # Convert date strings to datetime
    start_date = substr(start_date, 1, 10) %>% as_date(),
    end_date = substr(end_date, 1, 10) %>% as_date(),
    included = 1
  ) %>%
  filter(from != to) %>%
  group_by(from, to, supertype, subtype, start_date, end_date, included) %>%
  summarize(weight = n())

4.2 Checking the type fields

Let’s confirm if the types have been mapped correctly to the corresponding supertype and subtype.

mc3_edges_clean %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
supertype subtype count
Ownership Shareholdership 39378
Ownership BeneficialOwnership 21529
Employment WorksFor 14817
Relationship FamilyRelationship 91

4.3 Checking the rest of the data

Let’s also inspect the rest of the data if they are in the form we need.

mc3_edges_clean %>% head() %>% kable()
from to supertype subtype start_date end_date included weight
4. SeaCargo Ges.m.b.H. Dry CreekRybachit Marine A/S Ownership Shareholdership 2034-12-31 NA 1 1
4. SeaCargo Ges.m.b.H. KambalaSea Freight Inc Ownership Shareholdership 2033-04-12 NA 1 1
9. RiverLine CJSC SumacAmerica Transport GmbH & Co. KG Ownership Shareholdership 2028-12-02 NA 1 1
Aaron Acosta Manning-Pratt Employment WorksFor 2008-07-30 NA 1 1
Aaron Acosta Manning-Pratt Ownership Shareholdership 2008-09-14 NA 1 1
Aaron Allen Hicks-Calderon Ownership BeneficialOwnership 2025-03-06 NA 1 1

The dates columns only have the date components, not the time. The dataframe also has all the columns we need.

5 Preparing the Networks

5.1 Supernetwork

We will generate supernetwork containing all the nodes and edges we prepared.

supernetwork = tbl_graph(
  edges = mc3_edges_clean,
  nodes = mc3_nodes_clean,
  directed = TRUE
)

Let’s check the size of this network.

vcount(supernetwork)
[1] 60520
ecount(supernetwork)
[1] 75815

We will save this as an RDS file for use in the Shiny app.

write_rds(supernetwork, "data/rds/supernetwork.rds")

5.2 Filter by subnetwork

The supernetwork is very large and not suitable for visualization as it requires a lot of computing resources to visualize.

Hence, we will create a function that will enable us to focus on the network of a given node. We will define a function extract_subnetwork for this.

extract_subnetwork <- function(graph, node_name, distance=-1) {
  # negative distance will show full graph
  node <- which(V(graph)$name == node_name)
  
  if(length(node) == 0) {
    # Return empty graph
    return(tbl_graph())
  }
  
  distance <- ifelse(distance < 0, length(graph), distance)
  vertices <- ego(graph, nodes = node, order = distance)[[1]]
  igraph_subgraph <- induced_subgraph(graph, vids = vertices)
  nodes_df <- as_data_frame(igraph_subgraph, what = "vertices")
  edges_sf <- as_data_frame(igraph_subgraph, what = "edges")
  tbl_graph(nodes=nodes_df, edges=edges_sf, directed=is_directed(graph))
}
About the function

This function generates a subnetwork from a graph based on the nodes in proximity to reference node.

The size of the network can be controlled by the distance of the other nodes from the reference node. If the distance is negative, it will include all nodes connected in any way to the reference node.

It uses ego from igraph to figure out which nodes are connected within a given distance from a node.

5.3 Filter by date

Next, to enable inspecting temporal patterns, we will filter the edges and nodes based on their existence on the given date.

We will define extract_network_snapshot to enable this filtering.

extract_network_snapshot <- function(graph, datestring, delete = FALSE) {
  date <- as_date(datestring)
  
  graph_nodes = as_data_frame(graph, what = "vertices")
  graph_edges = as_data_frame(graph, what = "edges")
  
  if(is.na(date) || vcount(graph) == 0) {
    return(graph)
  }
  
  # Assume transition is at 12 AM of given date
  graph_edges <- graph_edges %>%
    mutate(
      included = ifelse(is.na(start_date) | (
        start_date <= date &
          (is.na(end_date) |
             end_date > date)
      ), 1, 0)
    )
  
  filtered_edges <- graph_edges %>% filter(included == 1)
    
  graph_nodes <- graph_nodes %>%
    mutate(included = (
      name %in% filtered_edges$from | name %in% filtered_edges$to
    ))
  
  if(!delete) {
    return(
      tbl_graph(nodes = graph_nodes,
                edges = graph_edges,
                directed = is_directed(graph))
    )
  }
  
  tbl_graph(nodes = graph_nodes %>% filter(included == 1),
            edges = filtered_edges,
            directed = is_directed(graph))
}
About the function

This function sets included to true if the edge is active during the given date. For the nodes, they are considered active if they are connected to at least one other node at that point in time.

There is also an option to delete the inactive elements altogether, which is useful to calculate measures of centrality on the network structure at that point in time.

6 Preparing the power graph

To figure out who are the most influential entities within the network, we have to look at the power dynamics in each relationship. This entails generating a graph from the relationship graph that reflects these dynamics.

6.1 Arrow direction

As we want to investigate the flow of power and resources, the direction will be from the less powerful to more powerful entity.

6.2 Weights

For the purpose of our analysis, we will score each relationship flow according to the following:

  • Benefits from the resources: 1 (+1 if they are the owner of a company)

  • A decision-maker on where resources go: 2 (+1 if they are the ultimate decision maker)

  • Influential on the other person: 1 (only if target is a person)

We will use the total as the weights for each edge. Lastly, we will rename the relationship to prevent confusion in case of direction reversal.

6.3 Score table

Beneficial owners benefit from the resources of the company.

While BeneficialOwners may or may not be Shareholders in the same company, they hold the ultimate decision-making power.

Beneficial Ownership
Current From Owner
Current To Company
New From (Lower power) Company
New To (Higher power) Owner
New subtype OwnedBy
Beneficiary 1 + 1
Decision-maker 2 + 1
Influential (on person) 0
Total 5

Shareholders may receive proceeds from the profits of the company.

They have some decision-making power within the company. However, it is the Beneficial Owner that ultimately gets the final say in decisions.

Shareholdership
Current From Shareholder
Current To Company
New From (Lower power) Company
New To (Higher power) Shareholder
New subtype HasShareholder
Beneficiary 1
Decision-maker 2
Influential (on person) 0
Total 3

Employees are contractually obligated to act in the interests of their employer.

Employers also influence what their employees do in the business.

WorksFor
Current From Owner
Current To Company
New From (Lower power) Company
New To (Higher power) Owner
New subtype OwnedBy
Beneficiary
Decision-maker 2
Influential (on person) 1
Total 3

As it is impossible to figure out the power dynamics within a family, we will retain direction and assume that the arrow points to the person that benefits more from the relationship.

FamilyRelationship
Current From Person 1
Current To Person 2
New From (Lower power) Person 1
New To (Higher power) Person 2
New subtype FamilyRelationship
Beneficiary 1
Decision-maker 0
Influential (on person) 1
Total 2
Model Revision

From the results of visualization in the original Take-home Exercise 3, the model for the power graph resulted in the influence FamilyRelationship being too strong that it overrode all power held from business activities.

We revised this to be unidirectional to lessen the impact and we cannot assume that family members have the same sharing dynamic.

6.4 Converting relationship edges to power flow

Using the table above, we will create a function to convert relationship edges to power flow. We will define a function convert_edges_to_power_flow to take care of converting edges to power graph

Show the code for convert_edges_to_power_flow()
convert_edges_to_power_flow <- function(edges) {
  # Employee -> Employer, weight: 2
  works_for <- edges %>% filter(subtype == "WorksFor") %>% mutate(weight = 3)
  
  # Person1 <-> Person2, weight: 1
  family <- edges %>% filter(subtype == "FamilyRelationship") %>% mutate(weight = 2)
  
  # Shareholder <- Company, weight: 2
  shareholder <- edges %>% filter(subtype == "Shareholdership") %>%
    mutate(
      temp = from,
      from = to,
      to = temp,
      weight = 3,
      # Rename to prevent confusion due to reversed arrows
      subtype = "HasShareholder",
    ) %>%
    select(from, to, supertype, subtype, start_date, end_date, weight, included)
  
  # BeneficialOwner <- Company, weight: 3
  owner <- edges %>% filter(subtype == "BeneficialOwnership") %>%
    mutate(
      temp = from,
      from = to,
      to = temp,
      weight = 5,
      # Rename to prevent confusion due to reversed arrows
      subtype = "OwnedBy",
    ) %>%
    select(from, to, supertype, subtype, start_date, end_date, weight, included)
  
  works_for %>%
    rbind(family) %>%
    rbind(shareholder) %>%
    rbind(owner)
}

Let us check the resulting edges table to see if each edge type has been mapped correctly.

supernetwork %>%
  as_data_frame(what = "edges") %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
supertype subtype count
Ownership Shareholdership 39378
Ownership BeneficialOwnership 21529
Employment WorksFor 14817
Relationship FamilyRelationship 91
supernetwork %>%
  as_data_frame(what = "edges") %>%
  convert_edges_to_power_flow() %>%
  group_by(supertype, subtype) %>%
  summarize(count = n()) %>%
  arrange(-count) %>%
  kable()
supertype subtype count
Ownership HasShareholder 39378
Ownership OwnedBy 21529
Employment WorksFor 14817
Relationship FamilyRelationship 91