• Stratos Gounidellis - DS3517005
  • Natasa Farmaki - DS3517018
In [2]:
# import necessary libraries
from scipy.sparse.csgraph import floyd_warshall
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

Data loading & graph creation

In [3]:
# create the undirected graph
g = nx.read_edgelist("facebook_combined.txt",
                     create_using=nx.Graph(), nodetype=int)
In [4]:
# print the graph info
g = g.to_undirected()
print(nx.info(g))
Name: 
Type: Graph
Number of nodes: 4039
Number of edges: 88234
Average degree:  43.6910
In [5]:
# convert also a directed instance of a graph in 
# case it is needed in any of the functionalities later
# and print the graph  info. The number of edges is doubled.
g_dir = g.to_directed()
print(nx.info(g_dir))
Name: 
Type: DiGraph
Number of nodes: 4039
Number of edges: 176468
Average in degree:  43.6910
Average out degree:  43.6910

Algorithm functions

In [6]:
# Baseline approach: randomly choose friend recommendations for the
# facebook users. The reccomendations are sorted based on the nodeid
# Obviously, this is a completely naive approach and should not have
# high accuracy.
def baseline(G, node1, k):
    nodeid = random.sample(G.nodes(), k)
    friend_rec = pd.DataFrame({"nodeid": nodeid})
    friend_rec = friend_rec.sort_values(by=['nodeid'], ascending=[True])
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)
    return friend_rec
In [7]:
# Common neighbors approach: according to this method the best friend reccomendation
# is the facebook user with whom you have the greatest number
# of mutual friends. In the case of ties in friendship score the node with the smallest
# nodeID is shown first.
def common_neighbors(G, node1, k):
    nodeid = []
    number_of_common_neighbors = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            number_of_common_neighbors.append(len(list(nx.common_neighbors(G,
                                                                           node1,
                                                                           i))))
    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "number_of_common_neighbors": number_of_common_neighbors})
    friend_rec = friend_rec.sort_values(by=['number_of_common_neighbors',
                                            'nodeid'], ascending=[False, True])
    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    return friend_rec
In [8]:
# Return a set with the common friends of two facebook users.
def union_neighbors(G, node1, node2):
    return set(nx.all_neighbors(G, node1)) | set(nx.all_neighbors(G, node2))

# Jaccard coefficient approach: The Jaccard coefficient is a metric used to
# evaluate similarities. The Jaccard coefficient measures the similarity between 
# sets and is defined as the size of the intersection divided by the size 
# of the union. In the case of ties in friendship score the node with the smallest
# nodeID is shown first.
def jaccard_coef(G, node1, k):
    nodeid = []
    jaccard_coef = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            union_neigh = float(len(union_neighbors(G, node1, i)))
            jaccard_coef.append(len(list(nx.common_neighbors(G, node1, i)))/union_neigh)
    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "jaccard_coef": jaccard_coef})
    friend_rec = friend_rec.sort_values(by=['jaccard_coef', 'nodeid'],
                                        ascending=[False, True])
    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)
    cols = friend_rec.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    friend_rec = friend_rec[cols]
    return friend_rec
In [9]:
# Return the number of friends of facebook user.
def find_neighbors(G, common_neighbor):
    return len(list(nx.all_neighbors(G, common_neighbor)))

# Return the required logarithmic sum for the Adamic and Adar
# similarity function.
def calc_log_sum(G, common_neighbors):
    log_sum = 0.0
    for common_neighbor in common_neighbors:
        log_sum = log_sum + 1 / math.log(find_neighbors(G,
                                                        common_neighbor), 10)
    return log_sum

# Adamic and Adar function: Adamic and Adar proposed a weighted function for
# scoring similarity between users (nodes) in a network. The more similar a user
# is to another, the more likely a friendship will occur in the network. In the
# case of ties in friendship score the node with the smallest nodeID is shown first.
def adamic_adar(G, node1, k):
    nodeid = []
    adamic_adar_similarity = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            common_neighbors = nx.common_neighbors(G, node1, i)
            adamic_adar_similarity.append(calc_log_sum(G, common_neighbors))
    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "adamic_adar_similarity": adamic_adar_similarity})
    friend_rec = friend_rec.sort_values(by=['adamic_adar_similarity',
                                            'nodeid'],
                                        ascending=[False, True])

    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)

    cols = friend_rec.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    friend_rec = friend_rec[cols]

    return friend_rec
In [10]:
# compute the adjacency matrix of the undirected graph
A = nx.adjacency_matrix(g, nodelist=sorted(g.nodes()))

# compiute the distance matrix of the graph. The algorithm used
# is floyd_warshall while the distance matrix will be used for the
# metrics presented and implemented below.
distance_array = floyd_warshall(A, directed=False, unweighted=True)
In [11]:
# Graph distance approach: according to this approach the best friend 
# reccomendation is the facebook user that is closer to you. In the
# case of ties in friendship score the node with the smallest nodeID 
# is shown first.
def graph_distance(G, node1, distance_array, k):
    nodeid = []
    graph_distance_similarity = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            distance = distance_array[node1, i]
            graph_distance_similarity.append(float(distance))

    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "graph_distance_similarity": graph_distance_similarity})
    friend_rec = friend_rec.sort_values(by=['graph_distance_similarity',
                                            'nodeid'], ascending=[True, True])

    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)

    cols = friend_rec.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    friend_rec = friend_rec[cols]

    return friend_rec
In [12]:
# Weighted Graph Distance (FoF): this approach takes into account not only
# the distance among two facebook users but also their common friends.
# In that sense, it could be considered a weighted graph distance metric
# with the common neighbors as a weighting factor. In the
# case of ties in friendship score the node with the smallest nodeID 
# is shown first.
def distance_common_neigh(G, node1, distance_array, k):
    nodeid = []
    graph_distance_similarity = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            common_neighbors = len(list(nx.common_neighbors(G, node1, i)))
            distance = distance_array[node1, i]
            graph_distance_similarity.append(common_neighbors/float(distance))

    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "graph_distance_similarity": graph_distance_similarity})
    friend_rec = friend_rec.sort_values(by=['graph_distance_similarity',
                                            'nodeid'], ascending=[False, True])

    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)

    cols = friend_rec.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    friend_rec = friend_rec[cols]

    return friend_rec
In [13]:
# Weighted Graph Distance (Jaccard): this approach takes into account not only
# the distance among two facebook users but also the jaccard coefficient.
# In that sense, it could be considered a weighted graph distance metric
# with the jaccard coefficient as a weighting factor. In the
# case of ties in friendship score the node with the smallest nodeID 
# is shown first.
def distance_jaccard(G, node1, distance_array, k):
    nodeid = []
    distance_jaccard_similarity = []
    for i in G.nodes():
        if not G.has_edge(node1, i) and node1 != i:
            nodeid.append(i)
            union_neigh = float(len(union_neighbors(G, node1, i)))
            common_neighbors = len(list(nx.common_neighbors(G, node1, i)))
            jaccard_coef = common_neighbors / union_neigh
            distance = distance_array[node1, i]
            distance_jaccard_similarity.append(jaccard_coef/distance)

    friend_rec = pd.DataFrame({"nodeid": nodeid,
                               "distance_jaccard_similarity": distance_jaccard_similarity})
    friend_rec = friend_rec.sort_values(by=['distance_jaccard_similarity',
                                            'nodeid'], ascending=[False, True])

    friend_rec = friend_rec.head(k)
    friend_rec = friend_rec.reset_index(drop=True)
    friend_rec.nodeid = friend_rec.nodeid.astype(int)

    cols = friend_rec.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    friend_rec = friend_rec[cols]

    return friend_rec

Indicative Outputs

The ouput of the aformentioned metrics is presented for the facebook users with nodeId [107, 1126, 14, 35].

In [14]:
fb_users = [107, 1126, 14, 35]
In [15]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(common_neighbors(g, fb_user, 10))
Current facebook user: 107
nodeid number_of_common_neighbors
0 513 19
1 400 18
2 559 18
3 373 17
4 492 17
5 500 17
6 378 16
7 436 16
8 431 15
9 514 15
Current facebook user: 1126
nodeid number_of_common_neighbors
0 916 113
1 1238 103
2 1750 99
3 1230 98
4 1004 94
5 1791 94
6 1530 90
7 1172 89
8 1570 85
9 1597 84
Current facebook user: 14
nodeid number_of_common_neighbors
0 2 9
1 17 9
2 140 9
3 111 8
4 137 7
5 162 7
6 19 6
7 333 6
8 44 5
9 243 4
Current facebook user: 35
nodeid number_of_common_neighbors
0 46 2
1 68 2
2 99 2
3 131 2
4 175 2
5 177 2
6 225 2
7 227 2
8 278 2
9 321 2
In [17]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(jaccard_coef(g, fb_user, 10))
Current facebook user: 107
nodeid jaccard_coef
0 513 0.017040
1 400 0.016364
2 559 0.016216
3 492 0.015554
4 500 0.015302
5 373 0.015152
6 436 0.014692
7 378 0.014665
8 515 0.013749
9 514 0.013711
Current facebook user: 1126
nodeid jaccard_coef
0 916 0.487069
1 1750 0.440000
2 1230 0.435556
3 1530 0.422535
4 1004 0.419643
5 1238 0.417004
6 1172 0.400901
7 1791 0.400000
8 1789 0.374429
9 1597 0.371681
Current facebook user: 14
nodeid jaccard_coef
0 2 0.562500
1 140 0.529412
2 17 0.473684
3 162 0.437500
4 111 0.380952
5 333 0.352941
6 44 0.312500
7 137 0.291667
8 19 0.240000
9 243 0.210526
Current facebook user: 35
nodeid jaccard_coef
0 321 0.666667
1 11 0.500000
2 12 0.500000
3 15 0.500000
4 18 0.500000
5 37 0.500000
6 43 0.500000
7 74 0.500000
8 114 0.500000
9 209 0.500000
In [18]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(adamic_adar(g, fb_user, 10))
Current facebook user: 107
nodeid adamic_adar_similarity
0 513 9.724953
1 400 9.307010
2 559 8.818876
3 500 8.528682
4 492 8.373501
5 373 8.368363
6 378 7.852810
7 436 7.813676
8 524 7.441609
9 514 7.437235
Current facebook user: 1126
nodeid adamic_adar_similarity
0 916 53.556246
1 1238 48.753746
2 1750 46.496906
3 1230 45.811896
4 1004 44.041808
5 1791 43.959211
6 1530 41.808500
7 1172 41.579656
8 1570 40.006415
9 1597 39.146933
Current facebook user: 14
nodeid adamic_adar_similarity
0 2 6.843324
1 140 6.736628
2 17 6.736628
3 111 5.964252
4 162 5.214517
5 137 5.073642
6 333 4.598281
7 19 4.190559
8 44 3.471818
9 243 2.722615
Current facebook user: 35
nodeid adamic_adar_similarity
0 46 1.320278
1 68 1.320278
2 99 1.320278
3 131 1.320278
4 175 1.320278
5 177 1.320278
6 225 1.320278
7 227 1.320278
8 278 1.320278
9 321 1.320278
In [22]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(graph_distance(g, fb_user, distance_array, 10))
Current facebook user: 107
nodeid graph_distance_similarity
0 1 2.0
1 2 2.0
2 3 2.0
3 4 2.0
4 5 2.0
5 6 2.0
6 7 2.0
7 8 2.0
8 9 2.0
9 10 2.0
Current facebook user: 1126
nodeid graph_distance_similarity
0 0 2.0
1 58 2.0
2 136 2.0
3 171 2.0
4 348 2.0
5 353 2.0
6 363 2.0
7 366 2.0
8 376 2.0
9 389 2.0
Current facebook user: 14
nodeid graph_distance_similarity
0 1 2.0
1 2 2.0
2 3 2.0
3 4 2.0
4 5 2.0
5 6 2.0
6 7 2.0
7 8 2.0
8 9 2.0
9 10 2.0
Current facebook user: 35
nodeid graph_distance_similarity
0 1 2.0
1 2 2.0
2 3 2.0
3 4 2.0
4 5 2.0
5 6 2.0
6 7 2.0
7 8 2.0
8 9 2.0
9 10 2.0
In [20]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(distance_common_neigh(g, fb_user, distance_array, 10))
Current facebook user: 107
nodeid graph_distance_similarity
0 513 9.5
1 400 9.0
2 559 9.0
3 373 8.5
4 492 8.5
5 500 8.5
6 378 8.0
7 436 8.0
8 431 7.5
9 514 7.5
Current facebook user: 1126
nodeid graph_distance_similarity
0 916 56.5
1 1238 51.5
2 1750 49.5
3 1230 49.0
4 1004 47.0
5 1791 47.0
6 1530 45.0
7 1172 44.5
8 1570 42.5
9 1597 42.0
Current facebook user: 14
nodeid graph_distance_similarity
0 2 4.5
1 17 4.5
2 140 4.5
3 111 4.0
4 137 3.5
5 162 3.5
6 19 3.0
7 333 3.0
8 44 2.5
9 243 2.0
Current facebook user: 35
nodeid graph_distance_similarity
0 46 1.0
1 68 1.0
2 99 1.0
3 131 1.0
4 175 1.0
5 177 1.0
6 225 1.0
7 227 1.0
8 278 1.0
9 321 1.0
In [21]:
for fb_user in fb_users:
    print("Current facebook user:", fb_user)
    display(distance_jaccard(g, fb_user, distance_array, 10))
Current facebook user: 107
nodeid distance_jaccard_similarity
0 513 0.008520
1 400 0.008182
2 559 0.008108
3 492 0.007777
4 500 0.007651
5 373 0.007576
6 436 0.007346
7 378 0.007333
8 515 0.006874
9 514 0.006856
Current facebook user: 1126
nodeid distance_jaccard_similarity
0 916 0.243534
1 1750 0.220000
2 1230 0.217778
3 1530 0.211268
4 1004 0.209821
5 1238 0.208502
6 1172 0.200450
7 1791 0.200000
8 1789 0.187215
9 1597 0.185841
Current facebook user: 14
nodeid distance_jaccard_similarity
0 2 0.281250
1 140 0.264706
2 17 0.236842
3 162 0.218750
4 111 0.190476
5 333 0.176471
6 44 0.156250
7 137 0.145833
8 19 0.120000
9 243 0.105263
Current facebook user: 35
nodeid distance_jaccard_similarity
0 321 0.333333
1 11 0.250000
2 12 0.250000
3 15 0.250000
4 18 0.250000
5 37 0.250000
6 43 0.250000
7 74 0.250000
8 114 0.250000
9 209 0.250000

Measures to evaluate metrics' performance

In [23]:
# Examine whether the scoring functions give different
# recommendations for specific users. We consider only the 
# facebook user with a nodeId, which is a multiple of 10, i.e.
# 40 users in total. The distance based approaches seem to be
# completely different from the other.
def compare_algos(G, distance_array, k):
    same_rec = 0
    dif_rec = 0
    
    fof_jaccard = 0.0
    jaccard_adamic = 0.0
    fof_adamic = 0.0
    
    fof_distance = 0.0
    fof_distance_common = 0.0
    fof_distance_jaccard = 0.0
    
    jaccard_distance = 0.0
    jaccard_distance_common = 0.0
    jaccard_distance_jaccard = 0.0
    
    adamic_distance = 0.0
    adamic_distance_common = 0.0
    adamic_distance_jaccard = 0.0

    for user in range(100, len(G.nodes()), 100):
        fof_metric = sorted(common_neighbors(G, user, k).nodeid.values)
        jaccard_metric = sorted(jaccard_coef(G, user, k).nodeid.values)
        adamic_adar_metric = sorted(adamic_adar(G, user, k).nodeid.values)
        distance_metric = sorted(graph_distance(G, user, distance_array, k).nodeid.values)
        distance_common_metric = sorted(distance_common_neigh(G, user, distance_array, k).nodeid.values)
        distance_jaccard_metric = sorted(distance_jaccard(G, user, distance_array, k).nodeid.values)
        
        bool_first = (fof_metric == jaccard_metric) and (jaccard_metric == adamic_adar_metric) and (fof_metric == adamic_adar_metric)
        bool_sec = (fof_metric == distance_metric) and (jaccard_metric == distance_common_metric) and (distance_jaccard_metric == adamic_adar_metric)
        bool_third = (fof_metric == distance_common_metric) and (jaccard_metric == distance_jaccard_metric) and (distance_metric == adamic_adar_metric)
        bool_fourth = (fof_metric == distance_jaccard_metric) and (jaccard_metric == distance_metric) and (distance_common_metric == adamic_adar_metric)

        if bool_first and bool_sec and bool_third and bool_fourth:
            same_rec += 1
        else:
            dif_rec += 1
        fof_jaccard += len(set(fof_metric) & set(jaccard_metric)) / float(len(set(fof_metric) | set(jaccard_metric)))
        jaccard_adamic += len(set(adamic_adar_metric) & set(jaccard_metric)) / float(len(set(adamic_adar_metric) | set(jaccard_metric)))
        fof_adamic += len(set(fof_metric) & set(adamic_adar_metric)) / float(len(set(fof_metric) | set(adamic_adar_metric)))
        
        fof_distance = len(set(fof_metric) & set(distance_metric)) / float(len(set(fof_metric) | set(distance_metric)))
        fof_distance_common = len(set(fof_metric) & set(distance_common_metric)) / float(len(set(fof_metric) | set(distance_common_metric)))
        fof_distance_jaccard = len(set(fof_metric) & set(distance_jaccard_metric)) / float(len(set(fof_metric) | set(distance_jaccard_metric)))
        
        jaccard_distance = len(set(jaccard_metric) & set(distance_metric)) / float(len(set(jaccard_metric) | set(distance_metric)))
        jaccard_distance_common = len(set(jaccard_metric) & set(distance_common_metric)) / float(len(set(jaccard_metric) | set(distance_common_metric)))
        jaccard_distance_jaccard = len(set(jaccard_metric) & set(distance_jaccard_metric)) / float(len(set(jaccard_metric) | set(distance_jaccard_metric)))
        
        adamic_distance = len(set(adamic_adar_metric) & set(distance_metric)) / float(len(set(adamic_adar_metric) | set(distance_metric)))
        adamic_distance_common = len(set(adamic_adar_metric) & set(distance_common_metric)) / float(len(set(adamic_adar_metric) | set(distance_common_metric)))
        adamic_distance_jaccard = len(set(adamic_adar_metric) & set(distance_jaccard_metric)) / float(len(set(adamic_adar_metric) | set(distance_jaccard_metric)))

        
    fof_jaccard /= 40
    jaccard_adamic /= 40
    fof_adamic /= 40
    fof_distance /= 40
    fof_distance_common /= 40
    fof_distance_jaccard /= 40
    jaccard_distance /= 40
    jaccard_distance_common /= 40
    jaccard_distance_jaccard /= 40
    adamic_distance /= 40
    adamic_distance_common /= 40
    adamic_distance_jaccard /= 40

    average_similarity = float(fof_jaccard + jaccard_adamic + fof_adamic + fof_distance + fof_distance_common +
                               fof_distance_jaccard + jaccard_distance + jaccard_distance_common + jaccard_distance_jaccard +
                               adamic_distance + adamic_distance_common + adamic_distance_jaccard) / 12
    
    print("Users with same recommendations:", same_rec)
    print("Users with different recommendations:", dif_rec)
    print()
    
    print("FoF - Jaccard similarity percentage:", fof_jaccard)
    print("Jaccard - Adamic similarity percentage:", jaccard_adamic)
    print("FoF - Adamic similarity percentage:", fof_adamic)
    print()
    
    print("FoF - Distance similarity percentage:", fof_distance)
    print("FoF - Weighted Distance (FoF) similarity percentage:", fof_distance_common)
    print("FoF - Weighted Distance (Jaccard) similarity percentage:", fof_distance_jaccard)
    print()
    
    print("Jaccard - Distance similarity percentage:", jaccard_distance)
    print("Jaccard - Weighted Distance (FoF) similarity percentage:", jaccard_distance_common)
    print("Jaccard - Weighted Distance (Jaccard) similarity percentage:", jaccard_distance_jaccard)
    print()
    
    print("Adamic - Distance similarity percentage:", adamic_distance)
    print("Adamic - Weighted Distance (FoF) similarity percentage:", adamic_distance_common)
    print("Adamic - Weighted Distance (Jaccard) similarity percentage:", adamic_distance_jaccard)
    print()

    print("Average similarity between the algorithms:", average_similarity)
    
compare_algos(g, distance_array, 10)
            
Users with same recommendations: 0
Users with different recommendations: 40

FoF - Jaccard similarity percentage: 0.4460914880613022
Jaccard - Adamic similarity percentage: 0.45298849327487084
FoF - Adamic similarity percentage: 0.8567927170868345

FoF - Distance similarity percentage: 0.00625
FoF - Weighted Distance (FoF) similarity percentage: 0.025
FoF - Weighted Distance (Jaccard) similarity percentage: 0.020454545454545454

Jaccard - Distance similarity percentage: 0.00625
Jaccard - Weighted Distance (FoF) similarity percentage: 0.020454545454545454
Jaccard - Weighted Distance (Jaccard) similarity percentage: 0.025

Adamic - Distance similarity percentage: 0.00625
Adamic - Weighted Distance (FoF) similarity percentage: 0.025
Adamic - Weighted Distance (Jaccard) similarity percentage: 0.020454545454545454

Average similarity between the algorithms: 0.15924886123222035
In [24]:
# Return a set of random edges of the graph.
# The number of the required edges has to be specified.
def random_choices(G, n):
    edges = random.sample(G.edges(), n)
    return edges

# Return a set of random edges which can be
# used for the evaluation of the six different metrics.
# The same set of random edges is used for the evaluation of the
# algorithms,to prevent different random choices from skewing
# the results.
def random_set_edges(G, k):
    random_edges = random_choices(G, len(G.edges()))
    selected_edges = []
    counter = 0
    for edge in random_edges:
        G.remove_edge(*edge)
    
        metric_f1_fof = common_neighbors(G, edge[0], k).nodeid.values
        metric_f2_fof = common_neighbors(G, edge[1], k).nodeid.values

        metric_f1_jaccard = jaccard_coef(G, edge[0], k).nodeid.values
        metric_f2_jaccard = jaccard_coef(G, edge[1], k).nodeid.values

        metric_f1_adamic = adamic_adar(G, edge[0], k).nodeid.values
        metric_f2_adamic = adamic_adar(G, edge[1], k).nodeid.values

        metric_f1_distance_common = distance_common_neigh(G, edge[0], distance_array, k).nodeid.values
        metric_f2_distance_common = distance_common_neigh(G, edge[1], distance_array, k).nodeid.values
        
        metric_f1_distance_jaccard = distance_jaccard(G, edge[0], distance_array, k).nodeid.values
        metric_f2_distance_jaccard = distance_jaccard(G, edge[1], distance_array, k).nodeid.values

        metric_f1_distance = graph_distance(G, edge[0], distance_array, k).nodeid.values
        metric_f2_distance = graph_distance(G, edge[1], distance_array, k).nodeid.values
            
        if edge[1] not in metric_f1_fof or edge[0] not in metric_f2_fof:
            G.add_edge(*edge)
            continue

        if edge[1] not in metric_f1_jaccard or edge[0] not in metric_f2_jaccard:
            G.add_edge(*edge)
            continue
            
        if edge[1] not in metric_f1_adamic or edge[0] not in metric_f2_adamic:
            G.add_edge(*edge)
            continue

        if edge[1] not in metric_f1_distance or edge[0] not in metric_f2_distance:
            G.add_edge(*edge)
            continue
            
        if edge[1] not in metric_f1_distance_jaccard or edge[0] not in metric_f2_distance_jaccard:
            G.add_edge(*edge)
            continue
            
        if edge[1] not in metric_f1_distance_common or edge[0] not in metric_f2_distance_common:
            G.add_edge(*edge)
            continue
            
        counter += 1
        selected_edges.append(edge)
        G.add_edge(*edge)
        
        if counter == 100:
            break

    return selected_edges


# Evaluate the performance of each algorithm
def eval_algos(G, algorithm, edges, distance_array, k):
    count_fails = 0
    avg_rank = 0
    for edge in edges:
        G.remove_edge(*edge)
    
        if algorithm == "fof":
            metric_f1 = common_neighbors(G, edge[0], k).nodeid.values
            metric_f2 = common_neighbors(G, edge[1], k).nodeid.values
        elif algorithm == "jaccard":
            metric_f1 = jaccard_coef(G, edge[0], k).nodeid.values
            metric_f2 = jaccard_coef(G, edge[1], k).nodeid.values
        elif algorithm == "adamic_adar":
            metric_f1 = adamic_adar(G, edge[0], k).nodeid.values
            metric_f2 = adamic_adar(G, edge[1], k).nodeid.values
        elif algorithm == "baseline":
            metric_f1 = baseline(G, edge[0], k).nodeid.values
            metric_f2 = baseline(G, edge[1], k).nodeid.values
        elif algorithm == "distance_common_neigh":
            metric_f1 = distance_common_neigh(G, edge[0], distance_array, k).nodeid.values
            metric_f2 = distance_common_neigh(G, edge[1], distance_array, k).nodeid.values
        elif algorithm == "distance_jaccard":
            metric_f1 = distance_jaccard(G, edge[0], distance_array, k).nodeid.values
            metric_f2 = distance_jaccard(G, edge[1], distance_array, k).nodeid.values
        elif algorithm == "distance":
            metric_f1 = graph_distance(G, edge[0], distance_array, k).nodeid.values
            metric_f2 = graph_distance(G, edge[1], distance_array, k).nodeid.values
            
        if edge[1] not in metric_f1 or edge[0] not in metric_f2:
            G.add_edge(*edge)
            count_fails += 1
            continue
        
        # add one to the rank to be greater than zero
        rank_f2inf1 = np.where(metric_f1 == edge[1])[0][0] + 1
        rank_f1inf2 = np.where(metric_f2 == edge[0])[0][0] + 1
        # get the avrage rank
        avg_rank += float(rank_f2inf1 + rank_f1inf2) / 2
        G.add_edge(*edge)

    if (100 - count_fails) != 0:
        avg_rank /= float(100 - count_fails)
        
    else:
        avg_rank = k

    return avg_rank, count_fails
In [25]:
# Ecaluate the perfomance of the six algorithm. Obviously, the distance-based
# algorithms peform better. Especially, the plain vanilla version of the algorithm
# has the best performance and also the smallest standart deviation. Howver, the
# distance-based algorithms are more computationally intensive than the other approaches.
def wrapper_eval(G, distance_array, k, rep):
    algorithm = ["Common Neighbors (FoF)", "Jaccard Coefficient", "Adamic and Adar function",
                 "Graph Distance", "Weighted Graph Distance (FoF)", "Weighted Graph Distance (Jaccard)", "Baseline"]
    rank_fof = []
    rank_jaccard = []
    rank_adamic_adar = []
    rank_distance = []
    rank_distance_fof = []
    rank_distance_jaccard = []
    rank_baseline = []
    
    for i in range(rep):
        edges = random_set_edges(G, k)
        rank_fof.append(eval_algos(g, "fof", edges, distance_array, k)[0])
        rank_jaccard.append(eval_algos(g, "jaccard", edges, distance_array, k)[0])
        rank_adamic_adar.append(eval_algos(g, "adamic_adar", edges, distance_array, k)[0])
        rank_distance.append(eval_algos(g, "distance", edges, distance_array, k)[0])
        rank_distance_fof.append(eval_algos(g, "distance_common_neigh", edges, distance_array, k)[0])
        rank_distance_jaccard.append(eval_algos(g, "distance_jaccard", edges, distance_array, k)[0])
        rank_baseline.append(eval_algos(g, "baseline", edges, distance_array, k)[0])

    rank_avg = []
    rank_avg.append(np.mean(rank_fof))
    rank_avg.append(np.mean(rank_jaccard))
    rank_avg.append(np.mean(rank_adamic_adar))
    rank_avg.append(np.mean(rank_distance))
    rank_avg.append(np.mean(rank_distance_fof))
    rank_avg.append(np.mean(rank_distance_jaccard))
    rank_avg.append(np.mean(rank_baseline))
    
    rank_std = []
    rank_std.append(np.std(rank_fof))
    rank_std.append(np.std(rank_jaccard))
    rank_std.append(np.std(rank_adamic_adar))
    rank_std.append(np.std(rank_distance))
    rank_std.append(np.std(rank_distance_fof))
    rank_std.append(np.std(rank_distance_jaccard))
    rank_std.append(np.std(rank_baseline))
    
    
    resultsdf = pd.DataFrame({"algorithm": algorithm, "average_rank": rank_avg, "stdev_rank": rank_std})
    return resultsdf
In [55]:
display(wrapper_eval(g, distance_array, 100, 5))
algorithm average_rank stdev_rank
0 Common Neighbors (FoF) 13.297 1.331561
1 Jaccard Coefficient 13.411 0.949513
2 Adamic and Adar function 12.481 1.312385
3 Graph Distance 1.001 0.002000
4 Weighted Graph Distance (FoF) 2.347 0.258720
5 Weighted Graph Distance (Jaccard) 2.448 0.187419
6 Baseline 100.000 0.000000
In [26]:
display(wrapper_eval(g, distance_array, 100, 20))
algorithm average_rank stdev_rank
0 Common Neighbors (FoF) 13.85875 1.698487
1 Jaccard Coefficient 13.54875 1.520829
2 Adamic and Adar function 13.03550 1.644221
3 Graph Distance 1.00000 0.000000
4 Weighted Graph Distance (FoF) 2.34900 0.375991
5 Weighted Graph Distance (Jaccard) 2.49900 0.511871
6 Baseline 97.70000 8.845055