fastExecution = True # Whether to use pre-built shortcut files to skip long-running blocks of code
savingFigures = False # Whether to save the figures produced
savingData = False # Whether to build the shortcut files for future fastExecution runs
# Import Libraries
from __future__ import division # __future__ import first: true division everywhere
import io
import json
import math
import pickle
import re
import community # python-louvain, used for community detection
import fa2 # ForceAtlas2 layout
import matplotlib.cm as cm
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from scipy.special import zeta
from wordcloud import WordCloud
# Rendering Parameters
title_font = {'family': 'sans-serif',
'color': '#000000',
'weight': 'bold',
'size': 28,
}
#COLORS
mBlue = "#55638A" # For actor
fRed = "#9E1030" # For actress
The cleaning and preprocessing of the data can be found in Explainer notebook parts 1 and 2.
Here we load the movie and actor data frames (moviesDF, actorsDF), the movie year dictionary (movieAgeDict), the rating dictionary (ratingDict), the actor and movie name dictionaries (actorNameDict, movieNameDict) and a list containing all the collaborations between actors (actorsLinks).
from loadData import cleanLoadData
movieAgeDict,ratingDict,actorNameDict,movieNameDict,moviesDF,actorsDF,actorsLinks = cleanLoadData()
actorsDF.head(10)
moviesDF.sort_values("Rating", ascending=False).head(10)
We choose to build a graph where the nodes are the actors and the links represent the collaborations between two actors: the movies they both appeared in, the ratings those movies got, and the average year of their collaborations.
##########################
# Create the actors Graph
##########################
G = nx.Graph()
#add nodes
for i in actorsDF.index:
G.add_node(actorsDF.loc[i].at["iD"], Name= actorsDF.loc[i, "Name"], Gender = actorsDF.loc[i, "Gender"])
#add links
for link in actorsLinks:
if link[0] != link[1]:
if G.has_edge(link[0],link[1]): #Update existing edges
G[link[0]][link[1]]["weight"] +=1
G[link[0]][link[1]]["movies"].append(link[2])
            # Incrementally update the average rating
            avRating = (G[link[0]][link[1]]["avRating"])*(1-1.0/G[link[0]][link[1]]["weight"]) # Former ratings
            avRating += ratingDict[link[2]]/G[link[0]][link[1]]["weight"] # Added movie
            G[link[0]][link[1]]["avRating"] = avRating
            # Incrementally update the average year
            avYear = (G[link[0]][link[1]]["avYear"])*(1-1.0/G[link[0]][link[1]]["weight"]) # Former years
            avYear += movieAgeDict[link[2]]/G[link[0]][link[1]]["weight"] # Added movie
            G[link[0]][link[1]]["avYear"] = avYear
else: #Create new edge
G.add_edge(link[0], link[1],
weight = 1,
movies = [link[2]],
avRating = ratingDict[link[2]],
avYear = movieAgeDict[link[2]])
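For each new collaboration the averages are updated incrementally rather than recomputed: with weight $w$, $\overline{x}_w = \left(1-\frac{1}{w}\right)\overline{x}_{w-1} + \frac{x_w}{w}$, so we never need to store all the past ratings and years.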
We take the giant component so as to have a connected graph to study.
##################
# Clean the Graph
##################
G = max(nx.connected_component_subgraphs(G), key=len)
if savingData:
nx.write_gpickle(G, 'obj/full.gpickle')
print "The graph has "+str(G.number_of_nodes())+" nodes (actors) and "+str(G.number_of_edges())+" edges (collaborations)"
#################################################
# Set the node colors according to actors' gender
#################################################
def getColors(graph):
mBlue = "#55638A" # For actor
fRed = "#9E1030" # For actress
    colors = {} # Build the color dictionary
for n in graph.nodes:
if graph.nodes[n]['Gender'] == "F":
colors[n]= fRed
else:
colors[n]= mBlue
return colors
We use separate colors for men and women to see whether they have a different influence in the graph.
###############################################
# Set the edge colors according to movie years
###############################################
def getEdgeColors(graph):
c1930 = "#c12424"
c1955 = "#ff6612"
c1980 = "#ffce00"
c1995 = "#e3f018"
cNow = "#bdff00"
edgesColors = {}
    for e in graph.edges:
edgesColors[e] = c1930 #RED
if graph.get_edge_data(*e)["avYear"]>1930:
edgesColors[e] = c1955 #ORANGE
if graph.get_edge_data(*e)["avYear"]>1955:
edgesColors[e] = c1980 #YELLOW
if graph.get_edge_data(*e)["avYear"]>1980:
edgesColors[e] = c1995 #LIGHT GREEN
if graph.get_edge_data(*e)["avYear"]>1995:
edgesColors[e] = cNow #GREEN
return edgesColors
The edges are colored according to the average year of the two actors' collaborations, from red for old movies to green for the most recent ones. This shows how important age is in this network.
###########################################################
# Set the size of the nodes according to outgoing strength
###########################################################
sizes = {}
actorsDF["Collab"] = pd.Series(np.zeros(len(actorsDF.index))) #to store the outgoing strength
for i in actorsDF.index: # go through actors
    iD = actorsDF.loc[i].at["iD"]
    if iD in G.nodes(): # if the actor is in the graph
        edges = list(G.edges(iD))
        ogStrength = 0
        for e in edges: # go through his edges
            ogStrength += G.get_edge_data(*e)["weight"] # update the outgoing strength
actorsDF.loc[i, "Collab"]= ogStrength
sizes[iD] = ogStrength
else :
actorsDF.loc[i, "Collab"]= 0
actorsDF=actorsDF.sort_values("Collab", ascending=False) #Sort actors DF
In this network, the outgoing strength corresponds to the number of times an actor has collaborated with others. It tends to grow with the number of movies made and the number of actors in those movies, so it represents the actor's importance in the industry. <br/> We add the names of the 10 actors with the highest outgoing strength to the plot.
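The outgoing strength of a node is exactly its weighted degree, so we can sanity-check the computation directly with NetworkX (a minimal sketch):
# Sanity check (sketch): the outgoing strength equals the weighted degree.
someActor = list(G.nodes())[0]
assert G.degree(someActor, weight="weight") == sizes[someActor]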
####################################################################
# Display the name of the actors with the biggest outgoing strength
####################################################################
SortedNames = np.asarray(actorsDF["Name"])
SortedNames = SortedNames[:10]
labels = {}
for n in G.nodes():
name = actorNameDict[n]
if name in SortedNames:
labels[n]="\n\n"+name
else:
labels[n]=""
We use the ForceAtlas2 algorithm to obtain a better rendering of the network.
########################
# Force Atlas Algorithm
########################
forceatlas2 = fa2.ForceAtlas2(
# Behavior alternatives
outboundAttractionDistribution=False, # Dissuade hubs
linLogMode=False, # NOT IMPLEMENTED
adjustSizes=False, # Prevent overlap (NOT IMPLEMENTED)
edgeWeightInfluence=1.0,
# Performance
jitterTolerance=1.0, # Tolerance
barnesHutOptimize=True,
barnesHutTheta=1.2,
multiThreaded=False, # NOT IMPLEMENTED
# Tuning
scalingRatio=0.005,
strongGravityMode=False,
gravity=20,
# Log
verbose=True)
The positions can be obtained with the ForceAtlas2 algorithm, but as the graph is pretty big this takes a long time (about 15 min). To make it faster we save the position dictionary in a text file, and in "fastExecution" mode we read the positions directly from this file.
####################
# Get the positions
####################
pos = {} # Positions of the nodes from the ForceAtlas2 algorithm
if fastExecution:
path = "DATA/forceAtlasPositions.txt"
files = io.open(path, mode="r", encoding="utf-8")
for row in files:
split = row.split("\t")
pos[split[0]] = (float(split[1]),float(split[2]))
files.close()
else:
pos = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=3000) #~15 mn
if savingData:
#Store the positions in a text file
path = "DATA/forceAtlasPositions.txt"
files = io.open(path, mode="w", encoding="utf-8")
for key in pos.keys():
row = key +"\t" + str(pos[key][0]) +"\t"+str(pos[key][1])+"\r\n"
files.write(row.replace("u'","").replace("'",""))
files.close()
The sizes are computed from the outgoing strength. To amplify the differences and make the plot more readable we use the square of the outgoing strength: nodes with a large outgoing strength appear bigger and small ones smaller. <br/>
$nodeSize = \left(0.2\cdot outgoingStrength\right)^2$
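For instance, an actor with outgoing strength 50 is drawn with node size $(0.2\cdot 50)^2 = 100$, while one with strength 10 only gets $(0.2\cdot 10)^2 = 4$.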
#######
# Draw
#######
#Get colors
colors = getColors(G)
edgesColors = getEdgeColors(G)
#Plot
fig = plt.figure(figsize=(30, 30))
nx.draw_networkx(G, pos,
node_size = [(0.2*sizes[n])**(2) for n in G.nodes()],
node_color = [colors[n] for n in G.nodes()],
with_labels=True,
width = 0.1,
edge_color=[edgesColors[e] for e in G.edges()],
labels = labels,
font_size = 17,
font_weight= "bold")
plt.axis('off')
plt.title("IMDb Actors Graph", fontdict = title_font )
base = 'Figures/actorGraph'
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
We can distinguish 3 main groups:
<br/>
We can see the impact of the collaborations' age: old American actors are grouped in the same area of the graph. Other old links are harder to visualize. This could be explained by the fact that the movie industry was not very big at the time, and not very international, so links between old non-American actors and the bulk of the actors are rare.
Let's build a function to get a distribution histogram from a list or an array of data.
######################
# HISTOGRAM FUNCTION
######################
def histogram(degrees, dens): # degrees (list or array of data), dens (whether it is a density histogram or not)
# Computing Bins
min_bin = np.amin(degrees)
max_bin = np.amax(degrees)
nb_bins = int(math.ceil(max_bin)-math.floor(min_bin))
v = np.empty(nb_bins+1)
v[nb_bins] = int(math.ceil(max_bin))
bins = np.empty(nb_bins)
for i in range(nb_bins):
v[i] = int(min_bin + i)
bins[i] = int(min_bin + i)
#Hist
hist, bin_edges = np.histogram(degrees,bins = v,density=dens)
return hist, bin_edges
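A quick sanity check of the helper on toy data (a minimal sketch; note that, following numpy's convention, the last bin is closed):
h, edges = histogram([1, 2, 2, 3, 3, 3], False)
print h      # [1 5]: one value in [1,2), five in the closed last bin [2,3]
print edges  # [ 1.  2.  3.]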
Let's look at how the movies and collaborations are spread across the years.
##################################
# HISTOGRAM OF THE MOVIES BY YEAR
##################################
moviesYear = np.asarray(moviesDF["Year"])
linksYear = []
for link in actorsLinks:
if link[0] != link[1]:
year = movieAgeDict[link[2]]
linksYear.append(year)
# Get the histograms
histM, binsM = histogram(moviesYear,False)
histL, binsL = histogram(linksYear,False)
plt.figure(figsize = (15,6))
plt.bar(binsM[:-1], histM, 0.35, color=mBlue, label = "Movies")
plt.bar([b+0.4 for b in binsL[:-1]], histL, 0.35, color=fRed, label = "Links")
plt.xlabel('Year')
plt.ylabel('Occurrences')
plt.title('Movies and links distribution', fontdict = title_font)
plt.legend()
base = 'Figures/moviesDist'
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
The gap in the 1960s corresponds to the worst years of the American industry: very few movies were produced at that time. Since the 1980s we can see a huge rise in the number of collaborations, but a smaller one for the movies. This could indicate that more and more movies are produced every year since that period, and that more and more actors play in those movies.<br/>
So we divide the data into movies from before 1970, then between 1970 and 1980, 1980 and 1990, 1990 and 2000, and finally after 2000.
############################
# REPARTITION AMONG PERIODS
############################
#Define periods and set counts
periods = [1900,1970,1980,1990,2000]
moviesByPeriods = np.zeros(5,dtype=int)
linksByPeriods = np.zeros(5,dtype=int)
nMovies = 0
nLinks = 0
#Go through movies
for i in moviesDF.index:
age = moviesDF.loc[i, "Year"]
nMovies +=1
if age < 1970:
moviesByPeriods[0]+=1
elif age < 1980:
moviesByPeriods[1]+=1
elif age < 1990:
moviesByPeriods[2]+=1
elif age < 2000:
moviesByPeriods[3]+=1
else:
moviesByPeriods[4]+=1
#Go through links
for e in G.edges():
age = G.get_edge_data(*e)["avYear"]
nLinks +=1
if age < 1970:
linksByPeriods[0]+=1
elif age < 1980:
linksByPeriods[1]+=1
elif age < 1990:
linksByPeriods[2]+=1
elif age < 2000:
linksByPeriods[3]+=1
else:
linksByPeriods[4]+=1
print "Period 1900-1970: "+str(moviesByPeriods[0])+"("+str(round(100*moviesByPeriods[0]/nMovies,2))+"%)"+" movies"+" - "+str(linksByPeriods[0])+"("+str(round(100*linksByPeriods[0]/nLinks,2))+"%)"+" links."
print "Period 1970-1980: "+str(moviesByPeriods[1])+"("+str(round(100*moviesByPeriods[1]/nMovies,2))+"%)"+" movies"+" - "+str(linksByPeriods[1])+"("+str(round(100*linksByPeriods[1]/nLinks,2))+"%)"+" links"
print "Period 1980-1990: "+str(moviesByPeriods[2])+"("+str(round(100*moviesByPeriods[2]/nMovies,2))+"%)"+" movies"+" - "+str(linksByPeriods[2])+"("+str(round(100*linksByPeriods[2]/nLinks,2))+"%)"+" links"
print "Period 1990-2000: "+str(moviesByPeriods[3])+"("+str(round(100*moviesByPeriods[3]/nMovies,2))+"%)"+" movies"+" - "+str(linksByPeriods[3])+"("+str(round(100*linksByPeriods[3]/nLinks,2))+"%)"+" links"
print "Period 2000+: "+str(moviesByPeriods[4])+"("+str(round(100*moviesByPeriods[4]/nMovies,2))+"%)"+" movies"+" - "+str(linksByPeriods[4])+"("+str(round(100*linksByPeriods[4]/nLinks,2))+"%)"+" links"
Build a function to get the graph corresponding to a specific period
###################################
# Graph per period function
###################################
def graphPeriod(start,end):
G_per = nx.Graph()
#add nodes
for i in actorsDF.index:
G_per.add_node(actorsDF.loc[i].at["iD"], Name= actorsDF.loc[i, "Name"], Gender = actorsDF.loc[i, "Gender"])
#add links
for link in actorsLinks:
if (start < movieAgeDict[link[2]]) and (movieAgeDict[link[2]]<= end):
if link[0] != link[1]:
if G_per.has_edge(link[0],link[1]):
G_per[link[0]][link[1]]["weight"] +=1
G_per[link[0]][link[1]]["movies"].append(link[2])
G_per[link[0]][link[1]]["avRating"] = (G_per[link[0]][link[1]]["avRating"])*(1-1.0/G_per[link[0]][link[1]]["weight"])+ratingDict[link[2]]/G_per[link[0]][link[1]]["weight"]
G_per[link[0]][link[1]]["avYear"] = (G_per[link[0]][link[1]]["avYear"])*(1-1.0/G_per[link[0]][link[1]]["weight"])+movieAgeDict[link[2]]/G_per[link[0]][link[1]]["weight"]
else:
G_per.add_edge(link[0], link[1], weight = 1, movies = [link[2]], avRating = ratingDict[link[2]], avYear = movieAgeDict[link[2]])
#take the giant component
G_per=max(nx.connected_component_subgraphs(G_per), key=len)
print "There are "+str(G_per.number_of_nodes()) +" nodes(actors) and "+ str(G_per.number_of_edges())+ " links(movie collaboration) in "+str(start)+'-'+str(end)+" period."
return G_per
###################################
# Subdivide the network by period
###################################
graphByPeriod = {}
for i in range(len(periods)):
    if i < 4:
        gph = graphPeriod(periods[i],periods[i+1])
        if savingData:
            nx.write_gpickle(gph, 'obj/graph_'+str(periods[i+1])+'.gpickle')
        graphByPeriod[str(periods[i+1])] = gph
    else:
        gph = graphPeriod(periods[i],2020)
        graphByPeriod["now"] = gph
        if savingData:
            nx.write_gpickle(gph, 'obj/graph_now.gpickle')
# Graph Titles
titles = {}
titles["1970"] = "1900-1970"
titles["1980"] = "1970-1980"
titles["1990"] = "1980-1990"
titles["2000"] = "1990-2000"
titles["now"] = "2000+"
We store the graphs and the corresponding data in dictionaries, which makes them easy to access afterwards.
As before for the full graph, there are two options: either run the ForceAtlas2 algorithm directly, or read the positions from the shortcut files.
################################
# Graph per period positionning
################################
positionsPeriod = {}
if not fastExecution:
for key in graphByPeriod:
p = forceatlas2.forceatlas2_networkx_layout(graphByPeriod[key], pos=None, iterations=3000)
positionsPeriod[key] = p
if savingData:
#Build a shortcut to speed up and not re-run the algorithm
#Store the positions in a text file
path = "DATA/forceAtlasPositions_"+key+".txt"
files = io.open(path, mode="w", encoding="utf-8")
            for node in p.keys():
                row = node+"\t"+str(p[node][0])+"\t"+str(p[node][1])+"\r\n"
                files.write(row.replace("u'","").replace("'",""))
files.close()
else:
    # Get the dictionary from the shortcut files
for key in graphByPeriod:
posit={}
path = "DATA/forceAtlasPositions_"+key+".txt"
files = io.open(path, mode="r", encoding="utf-8")
for row in files:
split = row.split("\t")
posit[split[0]] = (float(split[1]),float(split[2]))
files.close()
positionsPeriod[key] = posit
We build a draw function to automate the drawing of the period graphs. As input it receives the graph to consider, the title of the figure and the positions of the nodes given by the ForceAtlas2 algorithm. <br/> Sizes are computed the same way as for the full graph above.
################
# DRAW FUNCTION
################
#Auxiliary function
def getSecond(a):
return a[1]
def draw(graph,ttl,posi):
colors = getColors(graph) # Build the color and size arrays
#SIZE
# Get the actor/actress with the biggest number of collaborations
sizes = {}
os = []
sizeMax =0
for iD in graph.nodes():
edges = list(graph.edges(iD))
ogStrength = 0
for e in edges:
ogStrength += graph.get_edge_data(*e)["weight"]
sizes[iD] = ogStrength
os.append((graph.nodes[iD]["Name"],ogStrength))
if ogStrength > sizeMax:
sizeMax = ogStrength
#LABEL
    # Build a label dictionary with the names of the members to highlight
SortedNames = np.asarray(sorted(os, key=getSecond,reverse = True))[:,0]
SortedNames = SortedNames[:10]
labels = {}
for n in graph.nodes():
name = actorNameDict[n]
if name in SortedNames:
labels[n]="\n\n"+name
else:
labels[n]=""
    #POSITIONING
positions = posi
alpha =25/sizeMax
fig = plt.figure(figsize=(30, 30))
nx.draw_networkx(graph, positions,
node_size = [(alpha*sizes[n])**(2) for n in graph.nodes()], node_color = [colors[n] for n in graph.nodes()],
with_labels=True,
width = 0.1,edge_color='#999999',labels = labels,font_size = 17, font_weight= "bold")
plt.axis('off')
plt.title("Actors Graph Period "+ttl, fontdict = title_font )
base = 'Figures/actorGraph_'+ttl
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
####################
# Draw the networks
####################
for key in graphByPeriod.keys():
draw(graphByPeriod[key],titles[key], positionsPeriod[key])
In general we find American actors at the center of the graph, because it is the major industry, while on the outside actors from smaller industries (internationally speaking) can emerge: Indian actors in the 2000+ graph, for example.
################################
# Display the degrees histogram
################################
# Get the degrees
mDegrees = [G.degree(n) for n in G.nodes() if G.nodes[n]["Gender"] == "M"]
fDegrees = [G.degree(n) for n in G.nodes() if G.nodes[n]["Gender"] == "F"]
degrees = [G.degree(n) for n in G.nodes()]
# Get the histograms
histM, binsM = histogram(mDegrees,False)
histF, binsF = histogram(fDegrees,False)
histD, binsD = histogram(degrees,True)
# Plot
plt.figure(figsize = (15,6))
plt.bar(binsM[:-1], histM, 0.35, color=mBlue, label = "Degrees of Actors")
plt.bar([b+0.4 for b in binsF[:-1]], histF, 0.35, color=fRed, label = "Degrees of Actresses")
plt.xlabel('Degrees')
plt.ylabel('Number of occurrences')
plt.suptitle('Degree distribution', fontsize=16)
plt.legend()
base = 'Figures/degreeDist'
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
There are fewer actresses than actors, but the distributions of their degrees are really similar.
Let's compare the degree distribution with those of random networks (Poisson) and scale-free networks.
#######################
# Degree distributions
#######################
def PoissDegree(ks,avg):
return [np.exp(-avg)*avg**k/math.factorial(int(k)) for k in ks]
def ScaleFreeDegree(ks,gamma):
return [k**(-gamma)/zeta(gamma) for k in ks]
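These helpers implement the Poisson degree distribution of a random network, $p_k = e^{-\langle k\rangle}\frac{\langle k\rangle^k}{k!}$, and the normalized power law of a scale-free network, $p_k = \frac{k^{-\gamma}}{\zeta(\gamma)}$, where the Riemann zeta function $\zeta$ normalizes the distribution.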
################################################
# Degree distributions ploting on log log scale
################################################
# Log-Log Plot zoom on scale free
plt.figure(figsize = (10,4))
title_font['size'] = 20
plt.title("Log-Log Degree Distribution", fontdict = title_font)
plt.loglog(binsD[:-1], histD, '.', color=mBlue)
plt.xlabel('Log(Degree)')
plt.ylabel('Log(Number of occurrences)')
plt.ylim(1e-5,1)
avg = np.mean(degrees)
ks = np.linspace(1,np.max(degrees),np.max(degrees))
plt.loglog(ks,PoissDegree(ks,avg),"-",color="k")
plt.loglog(ks,ScaleFreeDegree(ks,2),"-",color="r")
plt.legend(["Observations",
"Random network degree distribution",
"Scale-free network degree distribution ($\gamma = 2$)"])
base = 'Figures/loglogZoomDegreeDist'
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
Let's package the degree analysis in a function to run it over all the period graphs.
#######################
# Full Degree Analysis
#######################
def degreeAnalysis(graph,ttl):
# Get the degrees
degrees = [graph.degree(n) for n in graph.nodes()]
# Get the histograms
histD, binsD = histogram(degrees,True)
# Log-Log Plot zoom on scale free
plt.figure(figsize = (10,4))
plt.title("Log-Log Degree Distribution of period "+ttl)
plt.loglog(binsD[:-1], histD, '.', color=mBlue)
plt.xlabel('Log(Degree)')
    plt.ylabel('Log(Number of occurrences)')
plt.ylim(1e-5,1)
avg = np.mean(degrees)
ks = np.linspace(1,np.max(degrees),np.max(degrees))
plt.loglog(ks,PoissDegree(ks,avg),"-",color="k")
plt.loglog(ks,ScaleFreeDegree(ks,2),"-",color="r")
plt.legend(["Observations",
"Random network degree distribution",
"Scale-free network degree distribution ($\gamma = 2$)"])
base = 'Figures/loglogZoomDegreeDist_'+ttl
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
##########################
# Run the Degree Analysis
##########################
for key in graphByPeriod.keys():
degreeAnalysis(graphByPeriod[key],titles[key])
All the distributions show that the small-degree nodes behave like in a random network, whereas the large ones tend to behave like in a scale-free network. This illustrates the fact that random networks underestimate the likelihood of nodes with very large degrees. In this case, those nodes correspond to actors playing in a high number of movies and thus with a high number of partners.
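To make this comparison quantitative we could also fit the exponent of the tail; a minimal sketch, assuming the third-party powerlaw package (not used elsewhere in this notebook) is installed:
# Optional sketch: fit a discrete power law to the degree distribution tail.
import powerlaw
fit = powerlaw.Fit(degrees, discrete=True) # degrees of the full graph, computed above
print "Estimated gamma:", fit.power_law.alpha
print "Tail starts at k_min =", fit.power_law.xmin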
To separate the graph, we run a community detection algorithm. We use the Louvain algorithm because of the high modularity it achieves [Orman, 2011].
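As a reminder, the modularity maximized by the Louvain method is $Q = \frac{1}{2m}\sum_{ij}\left[A_{ij} - \frac{k_i k_j}{2m}\right]\delta(c_i, c_j)$, where $A$ is the (weighted) adjacency matrix, $k_i$ the degree of node $i$, $m$ the total edge weight and $c_i$ the community of node $i$.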
##############################
# COMPUTE COMMUNITY DETECTION
##############################
def communityDetect(graph):
# We used the Louvain algorithm
partition = community.best_partition(graph)
# Compute the modularity
modularity = community.modularity(partition,graph)
s = "The partition made of the graph has a modularity of +"+str(round(modularity*100,3))
s +="% and contains "+str(len(set(partition.values())))+" communities."
print s
return partition,modularity
Let's use a function to draw the communities. This function also returns the ids of the biggest communities so that we can study them afterwards.
#######################
# PLOT THE COMMUNITIES
#######################
# Define a color range to differentiate the communities
colorRange = np.asarray([mBlue,'#db3aAA',fRed,'#aac857','#32BBAA','#4020AA','#084c61','#407058','#177e89','#bbcdcd'])
##INPUT :
# graph to draw, corresponding communities, title to give,
# positions of the nodes, boolean (whether it is the full graph or not)
##OUTPUT :
# id of the communities big enough to be displayed
def drawCommunities(graph, partition, ttl, posi, full):
# Drawing
    minComm = graph.number_of_nodes()/60 # minimum size for a community to be highlighted
biggestCommunitiesLabel={}
fig = plt.figure(figsize=(15, 15))
nextColor =0 # Color count to go through the color range
for com in set(partition.values()) :
        list_nodes = [nodes for nodes in partition.keys() # go through the communities built
                      if partition[nodes] == com]
if len(list_nodes)>minComm: # Pick a color for the big communities
col = nextColor % 10
color = colorRange[col]
nextColor +=1
shape = "o"
if nextColor>9:
shape = "^"
if nextColor>19:
shape = "s"
alph =1
labl = "Community "+str(com)+": "+str(len(list_nodes))+" members"
biggestCommunitiesLabel[com]=labl
nx.draw_networkx_nodes(graph, posi, list_nodes, node_size = 30,node_color = color, alpha=alph,node_shape=shape, label = labl)
        else: # Small communities are drawn in grey to increase readability
color = "#BBBBBB"
alph = 0.2
nx.draw_networkx_nodes(graph, posi, list_nodes, node_size = 20,node_color = color, alpha=alph)
nx.draw_networkx_edges(graph, posi, width = 0.1, arrows=False, alpha=0.5) # Draw the edges
plt.axis('off')
plt.legend()
if full:
plt.title('Actor Graph Communities', fontdict = title_font)
base = 'Figures/communities'
else:
plt.title('Communities for the period '+ttl, fontdict = title_font)
base = 'Figures/communities_'+ttl
    if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
return biggestCommunitiesLabel
#############################################
# RUN COMMUNITY DETECTION FOR THE FULL GRAPH
#############################################
partG,modulG = communityDetect(G)
biggestCommunitiesLabel = drawCommunities(G,partG,"full",pos, True)
###############################################
# RUN COMMUNITY DETECTION FOR THE PERIOD GRAPH
###############################################
# Initialize dicts to store the data
partDict = {}
modulDict = {}
biggestCommDict = {}
# Run the community detection
for key in graphByPeriod.keys():
p,m = communityDetect(graphByPeriod[key])
b = drawCommunities(graphByPeriod[key],p,titles[key], positionsPeriod[key],False)
partDict[key]=p
modulDict[key]=m
biggestCommDict[key]=b
The communities are very clear in the full graph; for each period some communities seem clear too, but they are more mixed in the graphs.<br/>
To get a better understanding of those communities, let's analyse which actors and movies they are composed of. We are going to study:
To study the sentiments, we first store the sentiment analysis results into a dictionary.
#####################
# GET THE SENTIMENTS
#####################
files = open("sentiment.txt")
s = files.readlines()[0]
sentimentDict = json.loads(s)
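Judging from how sentimentDict is used below, each entry seems to map a movie id to a [count, total] pair; a minimal sketch of how a movie's average sentiment would then be computed (the exact format is an assumption inferred from usage):
# Assumed format (inferred from usage below): sentimentDict[movieId] = [count, total]
movieId = sentimentDict.keys()[0] # Python 2: keys() returns a list
count, total = sentimentDict[movieId]
print "Average sentiment of "+str(movieId)+": "+str(total*1.0/count)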
Let's build a function to analyse the communities. Given a graph, a partition and a list of communities, it builds a report containing, for each large community, the main actors and movies, and basic statistics on age, rating and sentiment.
##########################
# COMMUNITIES EXPLANATION
##########################
def communityExplain(graph, partition, bigComms, classification, lim):
commString = "" # String containing all the infos collected
    for com in bigComms.keys(): # go through the large communities of this graph
        list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com] # collect the nodes of this community
        grphCom = graph.subgraph(list_nodes) # select the subgraph corresponding to the community
        os = [] # store the outgoing strengths to get the main actors
rating = [] #Store the ratings
age = [] #Store the age of the movies
movies = {} #Store the movies
        for iD in grphCom.nodes(): # go through the nodes
            edges = list(grphCom.edges(iD))
            lMovies = []
            ogStrength = 0
            for e in edges: # store the data from every edge of the subgraph
ogStrength += grphCom.get_edge_data(*e)["weight"]
rating.append(grphCom.get_edge_data(*e)["avRating"])
age.append(grphCom.get_edge_data(*e)["avYear"])
m = grphCom.get_edge_data(*e)["movies"]
for i in m:
lMovies.append(i)
if i in movies.keys():
movies[i]+=1
else:
movies[i]=1
if not classification == "collaborations":
ogStrength = ogStrength/len(edges)
if len(set(lMovies))>lim:
os.append((graph.nodes[iD]["Name"],ogStrength))
# Start the txt log
commString += "\r\n \r\n####################################### \r\n"
commString += "Community: "+str(com) + "\r\n"
commString += "####################################### \r\n \r\n"
commString += "Members:"+str(len(list_nodes))+ "\r\n"
# Get and log the main actors
mainAc = sorted(os, key=getSecond, reverse = True)
s = "Main Actors: "
nActorsToPrint = 5
if len(mainAc) <5:
nActorsToPrint=len(mainAc)
for i in range(nActorsToPrint):
s+= mainAc[i][0]+" ("+str(mainAc[i][1])+" "+classification+") "
# Log the year and ratings
commString += s + "\r\n"
commString += "Year: "+str(round(np.min(age)))+'-'+str(round(np.max(age)))+" Av: "+str(round(np.mean(age),2))+" Std: "+str(round(np.std(age),2)) + "\r\n"
commString +="Ratings: "+str(round(np.min(rating),2))+'-'+str(round(np.max(rating),2))+" Av: "+str(round(np.mean(rating),2))+" Std: "+str(round(np.std(rating),2)) + "\r\n"
mainMovies = sorted(movies.items(), key= getSecond, reverse = True)
# Get and log the main movies
s = "Main Movies: "
        for i in range(5):
            if mainMovies[i][0] in movieNameDict.keys():
                s += movieNameDict[mainMovies[i][0]]+" ("+str(mainMovies[i][1]//2)+" times) " # each movie is counted once per endpoint
commString += s + "\r\n"
# Get and log the sentiment
sentiment = []
for i in range(len(mainMovies)):
if mainMovies[i][0] in sentimentDict.keys():
sentiment.append((sentimentDict[mainMovies[i][0]][1]*1.0/sentimentDict[mainMovies[i][0]][0]))
s = "Sentiments: "+str(round(np.min(sentiment),5))+'-'+str(round(np.max(sentiment),5))+" Av: "+str(round(np.mean(sentiment),5))+" Std: "+str(round(np.std(sentiment),5)) + "\r\n"
commString += s + "\r\n"
# return the log built
return commString
#########################
# ANALYSE THE FULL GRAPH
#########################
files = io.open("Full_communityAnalysis.txt", mode="w", encoding="utf-8" )
s = communityExplain(G,partG,biggestCommunitiesLabel,"collaborations",1)
print s
files.write(s)
files.close()
The sentiments tend to be really close across the communities, but the ratings differ: old movies (6.4), French movies (6.62), Indian movies (6.43) and Japanese ones (6.53) have better ratings on average.
########################
# ANALYSE PERIOD GRAPHS
########################
files = io.open("communityAnalysis.txt", mode="w", encoding="utf-8" )
for key in graphByPeriod.keys(): #Go through all period graph
p = partDict[key]
m = modulDict[key]
b = biggestCommDict[key]
#Print the report
s1 = "\r\n####################################### \r\n####################################### \r\n\r\n"
s1 += "Period: "+titles[key]+ "\r\n" + "Modularity: "+str(m)+ "\r\n \r\n" +"####################################### \r\n####################################### \r\n"
print "#######################################"
print "#######################################"
print ""
print "Period: "+titles[key]
print "Modularity: "+str(m)
print ""
print "#######################################"
print "#######################################"
files.write(unicode(s1, "utf-8"))
s = communityExplain(graphByPeriod[key],p,b, "collaborations",7)
files.write(s)
print s
files.close()
Communities by period are harder to interpret because they are smaller. French, Indian, Chinese and Japanese movies are still pretty simple to distinguish, probably because those actors collaborate more often with each other than with others. As we saw for the full graph, the sentiments are very similar across all the communities but the ratings can vary between communities, from 4.5 to 7 on average.<br/> Here are the communities with the best average rating.
For now the weight of an edge corresponds to the number of collaborations between two actors. We can instead change the weight to the average rating or the sentiment, and study how the network changes.
The first thing to do is to build a ratings-based graph. Let's define a function to do so, in order to apply it to all the graphs we have.<br/>
This function also returns a size dictionary with sizes proportional to the average rating of each actor, and rating lists to draw a histogram of the actors' average ratings.
#########################
# Rating Weight Function
#########################
def ratingWeight(graph):
res = graph.copy() # Make a copy not to erase the graph
# prepare data storage
ratingDict = {}
ratingListMen = []
ratingListWomen = []
    for n in res.nodes(): # go through nodes
        avRating = 0 # set counts
        countNeighbors = 0
        for n2 in res.neighbors(n): # go through neighbors to study the links
            countNeighbors += 1
            res[n][n2]["weight"] = res[n][n2]["avRating"] # update the weight
            avRating += res[n][n2]["avRating"]
avRating = avRating/countNeighbors # Compute average rating
ratingDict[n] = [avRating,countNeighbors]
if res.nodes[n]["Gender"] == "M": #Update lists
ratingListMen.append(int(round(10*avRating)))
else:
ratingListWomen.append(int(round(10*avRating)))
return ratingDict,ratingListMen,ratingListWomen,res
#############################
# Apply it to the full graph
#############################
ratingDict_G,ratingListMen_G,ratingListWomen_G,G_ratings = ratingWeight(G)
With this data computed, we can draw a histogram of the actors' average ratings according to their gender.
###########################
# Average Rating Histogram
###########################
def histProp(rMen_G,rWomen_G,full,ttl, mode):
# GET Histogram
histRatingMenG, binsRatingMenG = histogram(rMen_G,False)
histRatingWomenG, binsRatingWomenG = histogram(rWomen_G,False)
# Compute basic stat
avAvRatingMen = np.mean(rMen_G)
avAvRatingWomen = np.mean(rWomen_G)
# Plot
plt.figure(figsize = (15,6))
plt.bar([float(b/10) for b in binsRatingMenG[:-1]], histRatingMenG, 0.035, color=mBlue, label = "Average "+mode+" of Actors")
plt.bar([float(b/10)+0.04 for b in binsRatingWomenG[:-1]], histRatingWomenG, 0.035, color=fRed, label = "Average "+mode+" of Actress")
plt.plot([avAvRatingMen/10,avAvRatingMen/10],[0,np.max(histRatingMenG)+20], linestyle='-', linewidth = 1, color="#1dcaff", label= "Men Mean "+str(round(avAvRatingMen/10,2)))
plt.plot([avAvRatingWomen/10,avAvRatingWomen/10],[0,np.max(histRatingMenG)+20], linestyle='-', linewidth = 1, color="#ffc0cb", label= "Women Mean "+str(round(avAvRatingWomen/10,2)))
    plt.xlabel('Average '+mode)
    plt.ylabel('Number of occurrences')
if full:
plt.title('Actors Average '+mode, fontdict=title_font)
else:
plt.title('Actors Average '+mode+' Period '+ttl, fontdict=title_font)
plt.legend()
if full:
base = 'Figures/actorsAverage'+mode
else:
base = 'Figures/actorsAverage'+mode+'_'+ttl
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
histProp(ratingListMen_G,ratingListWomen_G,True,"","Ratings")
The distributions seem the same for men and women, and the averages are really close. Nevertheless, the proportion of women with an average rating above 8 is larger.
#################
# Plot the graph
#################
# Size rendering function to increase readability
def ratingSizeRendering(a,b,lim):
    if b > lim:
        return 0.5*((math.exp(a-5))**2+2)
    else:
        return 1 # for actors with too few movies to be representative
# Size rendering function to increase readability (used later for the sentiment graphs)
def sentSizeRendering(a,b,lim):
    if b > lim:
        return ((math.exp((7*a-39)))+2)**1.7
    else:
        return 1 # for actors with too few movies to be representative
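For reference, above the movie-count threshold these rendering functions are $s_{rating}(a) = 0.5\left(e^{2(a-5)}+2\right)$ and $s_{sent}(a) = \left(e^{7a-39}+2\right)^{1.7}$; both grow very fast with the average score $a$, so only the very best actors stand out visually.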
# Label the actors with the best grades and enough analysed movies
def drawWithSizes(graph,ttl,posi,sizes,full, mode):
    # minimum number of movies for an actor to be analysed
    # (to exclude actors with too little data, which would not be representative)
    if full:
        lim = 10 # 10 movies for the full graph
    else:
        lim = 7 # 7 for the period graphs
colors = getColors(graph) # Build the color and size arrays
#SIZE
    # Get the actors/actresses with the best average score (rating or sentiment)
siz = []
for n in graph.nodes():
if sizes[n][1] >lim:
siz.append([n,sizes[n][0]])
SortedNames = np.asarray(sorted(siz,key=getSecond, reverse= True))
SortedNames = SortedNames[:10]
#PRINT THE 10 BEST ACTORS
print ""
print "########################"
print "Best Actors:"
print "------------"
print ""
for i in range(10):
print str(i+1)+" - "+actorNameDict[SortedNames[i,0]]+" - "+str(round(sizes[SortedNames[i,0]][0],2))
print ""
print "########################"
print ""
labels = {}
for n in graph.nodes():
name = actorNameDict[n]
if n in SortedNames[:,0]:
labels[n]="\n\n"+name+" - "+str(round(sizes[n][0],2))
else:
labels[n]=""
    #POSITIONING
positions = posi
#Node size
# Choose rendering function
if mode == "Ratings":
nodeSize=[ratingSizeRendering(sizes[n][0],sizes[n][1],lim) for n in graph.nodes()]
else:
nodeSize=[sentSizeRendering(sizes[n][0],sizes[n][1],lim) for n in graph.nodes()]
#PLOT
fig = plt.figure(figsize=(15, 15))
nx.draw_networkx(graph, positions,
node_size = nodeSize,
node_color = [colors[n] for n in graph.nodes()],
with_labels=True,
width = 0.1,edge_color='#999999',labels = labels,font_size = 12, font_weight= "bold")
plt.axis('off')
if full:
plt.title(mode+ "-Based: Actors Graph", fontdict = title_font )
base = 'Figures/actorGraph_'+mode+"_full"
else:
plt.title(mode+ "-Based: Actors Graph Period "+ttl, fontdict = title_font )
base = 'Figures/actorGraph_'+mode+"_"+ttl
if savingFigures:
plt.savefig(base+'.jpeg', bbox_inches='tight')
plt.savefig(base+'.png', bbox_inches='tight')
plt.savefig(base+'.svg', bbox_inches='tight')
plt.show()
drawWithSizes(G_ratings,"",pos,ratingDict_G,True,"Ratings")
################################
# Apply it to the period graphs
################################
#ratingDict_G,ratingListMen_G,ratingListWomen_G,G_ratings
ratingDict_period = {}
ratingListMen_period = {}
ratingListWomen_period = {}
ratingGraph_period = {}
for key in graphByPeriod.keys():
#Collect Data
d,m,w,g = ratingWeight(graphByPeriod[key])
ratingDict_period[key]=d
ratingListMen_period[key]=m
ratingListWomen_period[key]=w
ratingGraph_period[key]=g
#Histogram
histProp(m,w,False,titles[key],"Ratings")
#Graph
drawWithSizes(g,titles[key],positionsPeriod[key],d,False,"Ratings")
Drawing the graphs, we observe that some small groups of closely connected actors can jointly have high ratings, as at the bottom of the old-movies graph. They correspond to actors that have made a lot of well-rated movies together. On the histograms, we see that men's and women's ratings are really similar.
Let's run the community detection with the average rating as weight to see if the Louvain algorithm finds new communities.
#############################################
# RUN COMMUNITY DETECTION FOR THE FULL GRAPH
#############################################
partRatingG,modulRatingG = communityDetect(G_ratings)
biggestCommunitiesLabelRating = drawCommunities(G_ratings,partRatingG,"full_Ratings",pos, True)
The communities found are very close to the ones found with the number of collaborations as weight. Community 14 (grey triangles), which we could not see before, corresponds to Spanish actors.
#########################
# ANALYSE THE FULL GRAPH
#########################
files = io.open("Full_communityAnalysis_Ratings.txt", mode="w", encoding="utf-8" )
s = communityExplain(G_ratings,partRatingG,biggestCommunitiesLabelRating, "av. Rating",10)
print s
files.write(s)
files.close()
The first thing to do is to build a sentiment-based graph. Let's define a function to do so, in order to apply it to all the graphs we have.<br/>
This function also returns a size dictionary with sizes proportional to the average sentiment score of each actor, and sentiment lists to draw a histogram of the actors' average sentiment scores.
############################
# Sentiment Weight Function
############################
def sentimentWeight(graph):
res = graph.copy()
sentDict = {}
sentListMen = []
sentListWomen = []
    for n in res.nodes(): # go through nodes
        avSent = 0
        countNeighbors = 0
        for n2 in res.neighbors(n): # go through neighbors, i.e. through the edges
            countNeighbors += 1 # set counts
            sentScore = 0
            countMovies = 0
            for m in res[n][n2]["movies"]: # go through the movies of this edge
                countMovies += 1
                sentScore += sentimentDict[m][1]*1.0/sentimentDict[m][0] # update the sentiment score
sentScore = sentScore/countMovies
res[n][n2]["weight"]=sentScore
avSent+=sentScore
avSent = avSent/countNeighbors # compute the average sentiment
sentDict[n] = [avSent,countNeighbors]
if res.nodes[n]["Gender"] == "M": # update the lists
sentListMen.append(int(round(10*avSent)))
else:
sentListWomen.append(int(round(10*avSent)))
return sentDict,sentListMen,sentListWomen,res
#############################
# Apply it to the full graph
#############################
sentDict_G,sentListMen_G,sentListWomen_G,G_sent = sentimentWeight(G)
#histogram
histProp(sentListMen_G,sentListWomen_G,True,"","Sentiments")
#################
# Plot the graph
#################
drawWithSizes(G_sent,"",pos,sentDict_G,True,"Sentiments")
################################
# Apply it to the period graphs
################################
#sentDict_G,sentListMen_G,sentListWomen_G,G_sent
sentDict_period = {}
sentListMen_period = {}
sentListWomen_period = {}
sentGraph_period = {}
for key in graphByPeriod.keys():
#Collect Data
d,m,w,g = sentimentWeight(graphByPeriod[key])
sentDict_period[key]=d
sentListMen_period[key]=m
sentListWomen_period[key]=w
sentGraph_period[key]=g
#Histogram
histProp(m,w,False,titles[key],"Sentiments")
#Graph
drawWithSizes(g,titles[key],positionsPeriod[key],d,False,"Sentiments")
#############################################
# RUN COMMUNITY DETECTION FOR THE FULL GRAPH
#############################################
partSentG,modulSentG = communityDetect(G_sent)
biggestCommunitiesLabelSent = drawCommunities(G_sent,partSentG,"full_Sentiments",pos, True)
#########################
# ANALYSE THE FULL GRAPH
#########################
files = io.open("Full_communityAnalysis_SentimentBased.txt", mode="w", encoding="utf-8" )
s = communityExplain(G_sent,partSentG,biggestCommunitiesLabelSent, "av. Sentiment score",10)
print s
files.write(s)
files.close()
The range of the sentiment scores is really small, so we do not see a lot of difference between actors. Nonetheless the community graph is slightly different: we see the Spanish community, and there is more difference between old and new movies.