# import necessary libraries
from scipy.sparse.csgraph import floyd_warshall
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
# create the undirected graph
g = nx.read_edgelist("facebook_combined.txt",
create_using=nx.Graph(), nodetype=int)
# the graph is already undirected (create_using=nx.Graph()), so the
# conversion below is a no-op kept for safety; print the graph info
# (note: nx.info was removed in NetworkX 3.x, where print(g) can be used)
g = g.to_undirected()
print(nx.info(g))
# also create a directed copy of the graph in case it is needed in any
# of the functionalities later, and print its info. The number of edges
# is doubled, since each undirected edge becomes two directed edges.
g_dir = g.to_directed()
print(nx.info(g_dir))
# Baseline approach: randomly choose friend recommendations for the
# facebook users. The recommendations are sorted by nodeid. Obviously,
# this is a completely naive approach and should not achieve high accuracy.
def baseline(G, node1, k):
    # node1 is deliberately ignored; list() is needed since random.sample requires a sequence
    nodeid = random.sample(list(G.nodes()), k)
friend_rec = pd.DataFrame({"nodeid": nodeid})
friend_rec = friend_rec.sort_values(by=['nodeid'], ascending=[True])
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
return friend_rec
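# A quick sanity check on a toy graph (an illustrative sketch; the toy
# graph and node id below are assumptions, not the Facebook data):
toy = nx.path_graph(5)  # path 0-1-2-3-4
print(baseline(toy, 0, 2))  # two random node ids, sorted ascending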
# Common neighbors approach: according to this method the best friend
# recommendation is the facebook user with whom you share the greatest
# number of mutual friends. In the case of ties in friendship score the
# node with the smallest nodeID is shown first.
def common_neighbors(G, node1, k):
nodeid = []
number_of_common_neighbors = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
number_of_common_neighbors.append(len(list(nx.common_neighbors(G,
node1,
i))))
friend_rec = pd.DataFrame({"nodeid": nodeid,
"number_of_common_neighbors": number_of_common_neighbors})
friend_rec = friend_rec.sort_values(by=['number_of_common_neighbors',
'nodeid'], ascending=[False, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
return friend_rec
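# Checking common_neighbors on a small toy graph (an illustrative
# assumption, not the Facebook data): nodes 0 and 3 share the mutual
# friends 1 and 2, so node 3 should top the list for node 0 with a
# score of 2, ahead of node 4.
toy_fof = nx.Graph([(0, 1), (0, 2), (1, 3), (2, 3), (3, 4)])
print(common_neighbors(toy_fof, 0, 2))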
# Return the union of the friends (neighbors) of two facebook users.
def union_neighbors(G, node1, node2):
return set(nx.all_neighbors(G, node1)) | set(nx.all_neighbors(G, node2))
# Jaccard coefficient approach: the Jaccard coefficient measures the
# similarity between sets and is defined as the size of the intersection
# divided by the size of the union. In the case of ties in friendship
# score the node with the smallest nodeID is shown first.
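# As a formula, with Γ(x) denoting the neighbor set of user x:
#     J(x, y) = |Γ(x) ∩ Γ(y)| / |Γ(x) ∪ Γ(y)|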
def jaccard_coef(G, node1, k):
nodeid = []
jaccard_coef = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
union_neigh = float(len(union_neighbors(G, node1, i)))
jaccard_coef.append(len(list(nx.common_neighbors(G, node1, i)))/union_neigh)
friend_rec = pd.DataFrame({"nodeid": nodeid,
"jaccard_coef": jaccard_coef})
friend_rec = friend_rec.sort_values(by=['jaccard_coef', 'nodeid'],
ascending=[False, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
cols = friend_rec.columns.tolist()
cols = cols[-1:] + cols[:-1]
friend_rec = friend_rec[cols]
return friend_rec
# Return the number of friends (i.e. the degree) of a facebook user.
def find_neighbors(G, common_neighbor):
    return G.degree(common_neighbor)
# Return the logarithmic sum required by the Adamic/Adar similarity
# function. Base-10 logs are used here instead of the natural log of the
# original formulation; this only rescales the scores by a constant
# factor, so the ranking is unaffected.
def calc_log_sum(G, common_neighbors):
log_sum = 0.0
for common_neighbor in common_neighbors:
log_sum = log_sum + 1 / math.log(find_neighbors(G,
common_neighbor), 10)
return log_sum
# Adamic and Adar function: Adamic and Adar proposed a weighted function for
# scoring similarity between users (nodes) in a network. The more similar a user
# is to another, the more likely a friendship will occur in the network. In the
# case of ties in friendship score the node with the smallest nodeID is shown first.
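# As a formula: score(x, y) = sum over z in Γ(x) ∩ Γ(y) of 1 / log(|Γ(z)|),
# i.e. mutual friends who have few friends themselves count for more.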
def adamic_adar(G, node1, k):
nodeid = []
adamic_adar_similarity = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
common_neighbors = nx.common_neighbors(G, node1, i)
adamic_adar_similarity.append(calc_log_sum(G, common_neighbors))
friend_rec = pd.DataFrame({"nodeid": nodeid,
"adamic_adar_similarity": adamic_adar_similarity})
friend_rec = friend_rec.sort_values(by=['adamic_adar_similarity',
'nodeid'],
ascending=[False, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
cols = friend_rec.columns.tolist()
cols = cols[-1:] + cols[:-1]
friend_rec = friend_rec[cols]
return friend_rec
# compute the adjacency matrix of the undirected graph. Sorting the
# nodelist makes the matrix index coincide with the nodeID, since the
# dataset labels its nodes 0..N-1.
A = nx.adjacency_matrix(g, nodelist=sorted(g.nodes()))
# compute the distance matrix of the graph with the Floyd-Warshall
# algorithm; it is used by the distance-based metrics implemented below.
distance_array = floyd_warshall(A, directed=False, unweighted=True)
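# Note: Floyd-Warshall is O(n^3) in the number of nodes, which is costly
# for a graph of ~4k nodes. Since the graph is unweighted, a BFS per
# source node (e.g. nx.single_source_shortest_path_length) would yield
# the same distances row by row; the dense matrix is kept here because
# the metrics below look distances up repeatedly.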
# Graph distance approach: according to this approach the best friend
# recommendation is the facebook user who is closest to you. In the
# case of ties in friendship score the node with the smallest nodeID
# is shown first.
def graph_distance(G, node1, distance_array, k):
nodeid = []
graph_distance_similarity = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
distance = distance_array[node1, i]
graph_distance_similarity.append(float(distance))
friend_rec = pd.DataFrame({"nodeid": nodeid,
"graph_distance_similarity": graph_distance_similarity})
friend_rec = friend_rec.sort_values(by=['graph_distance_similarity',
'nodeid'], ascending=[True, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
cols = friend_rec.columns.tolist()
cols = cols[-1:] + cols[:-1]
friend_rec = friend_rec[cols]
return friend_rec
# Weighted Graph Distance (FoF): this approach takes into account not only
# the distance between two facebook users but also their common friends.
# In that sense, it can be considered a weighted graph distance metric
# with the number of common neighbors as a weighting factor. In the
# case of ties in friendship score the node with the smallest nodeID
# is shown first.
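# As a formula: score(x, y) = |Γ(x) ∩ Γ(y)| / d(x, y), where d(x, y) is
# the shortest-path distance between the two users.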
def distance_common_neigh(G, node1, distance_array, k):
nodeid = []
graph_distance_similarity = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
common_neighbors = len(list(nx.common_neighbors(G, node1, i)))
distance = distance_array[node1, i]
graph_distance_similarity.append(common_neighbors/float(distance))
friend_rec = pd.DataFrame({"nodeid": nodeid,
"graph_distance_similarity": graph_distance_similarity})
friend_rec = friend_rec.sort_values(by=['graph_distance_similarity',
'nodeid'], ascending=[False, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
cols = friend_rec.columns.tolist()
cols = cols[-1:] + cols[:-1]
friend_rec = friend_rec[cols]
return friend_rec
# Weighted Graph Distance (Jaccard): this approach takes into account not
# only the distance between two facebook users but also their Jaccard
# coefficient. In that sense, it can be considered a weighted graph
# distance metric with the Jaccard coefficient as a weighting factor. In
# the case of ties in friendship score the node with the smallest nodeID
# is shown first.
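# As a formula: score(x, y) = J(x, y) / d(x, y), with J the Jaccard
# coefficient defined above and d the shortest-path distance.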
def distance_jaccard(G, node1, distance_array, k):
nodeid = []
distance_jaccard_similarity = []
for i in G.nodes():
if not G.has_edge(node1, i) and node1 != i:
nodeid.append(i)
union_neigh = float(len(union_neighbors(G, node1, i)))
common_neighbors = len(list(nx.common_neighbors(G, node1, i)))
jaccard_coef = common_neighbors / union_neigh
distance = distance_array[node1, i]
distance_jaccard_similarity.append(jaccard_coef/distance)
friend_rec = pd.DataFrame({"nodeid": nodeid,
"distance_jaccard_similarity": distance_jaccard_similarity})
friend_rec = friend_rec.sort_values(by=['distance_jaccard_similarity',
'nodeid'], ascending=[False, True])
friend_rec = friend_rec.head(k)
friend_rec = friend_rec.reset_index(drop=True)
friend_rec.nodeid = friend_rec.nodeid.astype(int)
cols = friend_rec.columns.tolist()
cols = cols[-1:] + cols[:-1]
friend_rec = friend_rec[cols]
return friend_rec
The output of the aforementioned metrics is presented for the facebook users with nodeIds 107, 1126, 14, and 35.
fb_users = [107, 1126, 14, 35]
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(common_neighbors(g, fb_user, 10))
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(jaccard_coef(g, fb_user, 10))
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(adamic_adar(g, fb_user, 10))
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(graph_distance(g, fb_user, distance_array, 10))
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(distance_common_neigh(g, fb_user, distance_array, 10))
for fb_user in fb_users:
print("Current facebook user:", fb_user)
display(distance_jaccard(g, fb_user, distance_array, 10))
# Examine whether the scoring functions give different
# recommendations for specific users. We consider only the
# facebook users whose nodeId is a multiple of 100, i.e.
# 40 users in total. The distance-based approaches turn out to be
# completely different from the others.
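# The overlap of two top-k lists A and B is measured with the Jaccard
# index of the recommendation sets, |A ∩ B| / |A ∪ B|, accumulated per
# user and averaged over the 40 examined users.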
def compare_algos(G, distance_array, k):
same_rec = 0
dif_rec = 0
fof_jaccard = 0.0
jaccard_adamic = 0.0
fof_adamic = 0.0
fof_distance = 0.0
fof_distance_common = 0.0
fof_distance_jaccard = 0.0
jaccard_distance = 0.0
jaccard_distance_common = 0.0
jaccard_distance_jaccard = 0.0
adamic_distance = 0.0
adamic_distance_common = 0.0
adamic_distance_jaccard = 0.0
for user in range(100, len(G.nodes()), 100):
fof_metric = sorted(common_neighbors(G, user, k).nodeid.values)
jaccard_metric = sorted(jaccard_coef(G, user, k).nodeid.values)
adamic_adar_metric = sorted(adamic_adar(G, user, k).nodeid.values)
distance_metric = sorted(graph_distance(G, user, distance_array, k).nodeid.values)
distance_common_metric = sorted(distance_common_neigh(G, user, distance_array, k).nodeid.values)
distance_jaccard_metric = sorted(distance_jaccard(G, user, distance_array, k).nodeid.values)
        # all six metrics agree for this user iff their sorted top-k
        # lists are identical
        all_same = (fof_metric == jaccard_metric == adamic_adar_metric ==
                    distance_metric == distance_common_metric ==
                    distance_jaccard_metric)
        if all_same:
same_rec += 1
else:
dif_rec += 1
fof_jaccard += len(set(fof_metric) & set(jaccard_metric)) / float(len(set(fof_metric) | set(jaccard_metric)))
jaccard_adamic += len(set(adamic_adar_metric) & set(jaccard_metric)) / float(len(set(adamic_adar_metric) | set(jaccard_metric)))
fof_adamic += len(set(fof_metric) & set(adamic_adar_metric)) / float(len(set(fof_metric) | set(adamic_adar_metric)))
        fof_distance += len(set(fof_metric) & set(distance_metric)) / float(len(set(fof_metric) | set(distance_metric)))
        fof_distance_common += len(set(fof_metric) & set(distance_common_metric)) / float(len(set(fof_metric) | set(distance_common_metric)))
        fof_distance_jaccard += len(set(fof_metric) & set(distance_jaccard_metric)) / float(len(set(fof_metric) | set(distance_jaccard_metric)))
        jaccard_distance += len(set(jaccard_metric) & set(distance_metric)) / float(len(set(jaccard_metric) | set(distance_metric)))
        jaccard_distance_common += len(set(jaccard_metric) & set(distance_common_metric)) / float(len(set(jaccard_metric) | set(distance_common_metric)))
        jaccard_distance_jaccard += len(set(jaccard_metric) & set(distance_jaccard_metric)) / float(len(set(jaccard_metric) | set(distance_jaccard_metric)))
        adamic_distance += len(set(adamic_adar_metric) & set(distance_metric)) / float(len(set(adamic_adar_metric) | set(distance_metric)))
        adamic_distance_common += len(set(adamic_adar_metric) & set(distance_common_metric)) / float(len(set(adamic_adar_metric) | set(distance_common_metric)))
        adamic_distance_jaccard += len(set(adamic_adar_metric) & set(distance_jaccard_metric)) / float(len(set(adamic_adar_metric) | set(distance_jaccard_metric)))
fof_jaccard /= 40
jaccard_adamic /= 40
fof_adamic /= 40
fof_distance /= 40
fof_distance_common /= 40
fof_distance_jaccard /= 40
jaccard_distance /= 40
jaccard_distance_common /= 40
jaccard_distance_jaccard /= 40
adamic_distance /= 40
adamic_distance_common /= 40
adamic_distance_jaccard /= 40
average_similarity = float(fof_jaccard + jaccard_adamic + fof_adamic + fof_distance + fof_distance_common +
fof_distance_jaccard + jaccard_distance + jaccard_distance_common + jaccard_distance_jaccard +
adamic_distance + adamic_distance_common + adamic_distance_jaccard) / 12
print("Users with same recommendations:", same_rec)
print("Users with different recommendations:", dif_rec)
print()
print("FoF - Jaccard similarity percentage:", fof_jaccard)
print("Jaccard - Adamic similarity percentage:", jaccard_adamic)
print("FoF - Adamic similarity percentage:", fof_adamic)
print()
print("FoF - Distance similarity percentage:", fof_distance)
print("FoF - Weighted Distance (FoF) similarity percentage:", fof_distance_common)
print("FoF - Weighted Distance (Jaccard) similarity percentage:", fof_distance_jaccard)
print()
print("Jaccard - Distance similarity percentage:", jaccard_distance)
print("Jaccard - Weighted Distance (FoF) similarity percentage:", jaccard_distance_common)
print("Jaccard - Weighted Distance (Jaccard) similarity percentage:", jaccard_distance_jaccard)
print()
print("Adamic - Distance similarity percentage:", adamic_distance)
print("Adamic - Weighted Distance (FoF) similarity percentage:", adamic_distance_common)
print("Adamic - Weighted Distance (Jaccard) similarity percentage:", adamic_distance_jaccard)
print()
print("Average similarity between the algorithms:", average_similarity)
compare_algos(g, distance_array, 10)
# Return a list of randomly sampled edges of the graph.
# The number of required edges has to be specified.
def random_choices(G, n):
    # list() is needed because random.sample requires a sequence
    edges = random.sample(list(G.edges()), n)
    return edges
# Return a set of random edges which can be used for the evaluation of
# the six different metrics. The same set of random edges is used for
# the evaluation of all algorithms, to prevent different random choices
# from skewing the results.
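# Each sampled edge (f1, f2) is temporarily removed and kept only if f2
# appears in every metric's top-k list for f1 and vice versa, so all
# algorithms are evaluated on edges they can, in principle, recover.
# Note that distance_array was computed on the full graph, so the
# distance-based metrics see slightly stale distances after a removal.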
def random_set_edges(G, k):
random_edges = random_choices(G, len(G.edges()))
selected_edges = []
counter = 0
for edge in random_edges:
G.remove_edge(*edge)
metric_f1_fof = common_neighbors(G, edge[0], k).nodeid.values
metric_f2_fof = common_neighbors(G, edge[1], k).nodeid.values
metric_f1_jaccard = jaccard_coef(G, edge[0], k).nodeid.values
metric_f2_jaccard = jaccard_coef(G, edge[1], k).nodeid.values
metric_f1_adamic = adamic_adar(G, edge[0], k).nodeid.values
metric_f2_adamic = adamic_adar(G, edge[1], k).nodeid.values
metric_f1_distance_common = distance_common_neigh(G, edge[0], distance_array, k).nodeid.values
metric_f2_distance_common = distance_common_neigh(G, edge[1], distance_array, k).nodeid.values
metric_f1_distance_jaccard = distance_jaccard(G, edge[0], distance_array, k).nodeid.values
metric_f2_distance_jaccard = distance_jaccard(G, edge[1], distance_array, k).nodeid.values
metric_f1_distance = graph_distance(G, edge[0], distance_array, k).nodeid.values
metric_f2_distance = graph_distance(G, edge[1], distance_array, k).nodeid.values
if edge[1] not in metric_f1_fof or edge[0] not in metric_f2_fof:
G.add_edge(*edge)
continue
if edge[1] not in metric_f1_jaccard or edge[0] not in metric_f2_jaccard:
G.add_edge(*edge)
continue
if edge[1] not in metric_f1_adamic or edge[0] not in metric_f2_adamic:
G.add_edge(*edge)
continue
if edge[1] not in metric_f1_distance or edge[0] not in metric_f2_distance:
G.add_edge(*edge)
continue
if edge[1] not in metric_f1_distance_jaccard or edge[0] not in metric_f2_distance_jaccard:
G.add_edge(*edge)
continue
if edge[1] not in metric_f1_distance_common or edge[0] not in metric_f2_distance_common:
G.add_edge(*edge)
continue
counter += 1
selected_edges.append(edge)
G.add_edge(*edge)
if counter == 100:
break
return selected_edges
# Evaluate the performance of each algorithm
def eval_algos(G, algorithm, edges, distance_array, k):
count_fails = 0
avg_rank = 0
for edge in edges:
G.remove_edge(*edge)
if algorithm == "fof":
metric_f1 = common_neighbors(G, edge[0], k).nodeid.values
metric_f2 = common_neighbors(G, edge[1], k).nodeid.values
elif algorithm == "jaccard":
metric_f1 = jaccard_coef(G, edge[0], k).nodeid.values
metric_f2 = jaccard_coef(G, edge[1], k).nodeid.values
elif algorithm == "adamic_adar":
metric_f1 = adamic_adar(G, edge[0], k).nodeid.values
metric_f2 = adamic_adar(G, edge[1], k).nodeid.values
elif algorithm == "baseline":
metric_f1 = baseline(G, edge[0], k).nodeid.values
metric_f2 = baseline(G, edge[1], k).nodeid.values
elif algorithm == "distance_common_neigh":
metric_f1 = distance_common_neigh(G, edge[0], distance_array, k).nodeid.values
metric_f2 = distance_common_neigh(G, edge[1], distance_array, k).nodeid.values
elif algorithm == "distance_jaccard":
metric_f1 = distance_jaccard(G, edge[0], distance_array, k).nodeid.values
metric_f2 = distance_jaccard(G, edge[1], distance_array, k).nodeid.values
elif algorithm == "distance":
metric_f1 = graph_distance(G, edge[0], distance_array, k).nodeid.values
metric_f2 = graph_distance(G, edge[1], distance_array, k).nodeid.values
if edge[1] not in metric_f1 or edge[0] not in metric_f2:
G.add_edge(*edge)
count_fails += 1
continue
        # ranks are 1-based: add one so the top recommendation has rank 1
rank_f2inf1 = np.where(metric_f1 == edge[1])[0][0] + 1
rank_f1inf2 = np.where(metric_f2 == edge[0])[0][0] + 1
        # accumulate the average rank of the two endpoints
avg_rank += float(rank_f2inf1 + rank_f1inf2) / 2
G.add_edge(*edge)
    # normalize by the number of edges actually recovered (the selection
    # aims for 100 edges, but use len(edges) in case fewer were found)
    if (len(edges) - count_fails) != 0:
        avg_rank /= float(len(edges) - count_fails)
else:
avg_rank = k
return avg_rank, count_fails
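# A worked example of the rank bookkeeping (hypothetical numbers): if the
# endpoints of a removed edge appear at positions 3 and 5 of each other's
# top-k lists, that edge contributes (3 + 5) / 2 = 4 to avg_rank, so a
# lower average rank means better recommendations.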
# Evaluate the performance of the six algorithms plus the baseline. As
# expected, the distance-based algorithms perform better; in particular,
# the plain vanilla graph-distance version has the best performance and
# also the smallest standard deviation. However, the distance-based
# algorithms are more computationally intensive than the other approaches.
def wrapper_eval(G, distance_array, k, rep):
algorithm = ["Common Neighbors (FoF)", "Jaccard Coefficient", "Adamic and Adar function",
"Graph Distance", "Weighted Graph Distance (FoF)", "Weighted Graph Distance (Jaccard)", "Baseline"]
rank_fof = []
rank_jaccard = []
rank_adamic_adar = []
rank_distance = []
rank_distance_fof = []
rank_distance_jaccard = []
rank_baseline = []
for i in range(rep):
edges = random_set_edges(G, k)
        # evaluate on the G passed in, not the global graph
        rank_fof.append(eval_algos(G, "fof", edges, distance_array, k)[0])
        rank_jaccard.append(eval_algos(G, "jaccard", edges, distance_array, k)[0])
        rank_adamic_adar.append(eval_algos(G, "adamic_adar", edges, distance_array, k)[0])
        rank_distance.append(eval_algos(G, "distance", edges, distance_array, k)[0])
        rank_distance_fof.append(eval_algos(G, "distance_common_neigh", edges, distance_array, k)[0])
        rank_distance_jaccard.append(eval_algos(G, "distance_jaccard", edges, distance_array, k)[0])
        rank_baseline.append(eval_algos(G, "baseline", edges, distance_array, k)[0])
rank_avg = []
rank_avg.append(np.mean(rank_fof))
rank_avg.append(np.mean(rank_jaccard))
rank_avg.append(np.mean(rank_adamic_adar))
rank_avg.append(np.mean(rank_distance))
rank_avg.append(np.mean(rank_distance_fof))
rank_avg.append(np.mean(rank_distance_jaccard))
rank_avg.append(np.mean(rank_baseline))
rank_std = []
rank_std.append(np.std(rank_fof))
rank_std.append(np.std(rank_jaccard))
rank_std.append(np.std(rank_adamic_adar))
rank_std.append(np.std(rank_distance))
rank_std.append(np.std(rank_distance_fof))
rank_std.append(np.std(rank_distance_jaccard))
rank_std.append(np.std(rank_baseline))
resultsdf = pd.DataFrame({"algorithm": algorithm, "average_rank": rank_avg, "stdev_rank": rank_std})
return resultsdf
display(wrapper_eval(g, distance_array, 100, 5))
display(wrapper_eval(g, distance_array, 100, 20))