Note
Click here to download the full example code
Possesion Chains
Create Possesion Chains
import pandas as pd
import numpy as np
import json
# plotting
import os
import pathlib
import warnings
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mplsoccer import Pitch
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')
Opening the dataset
First we open the data. It is exactly the same way as we did earlier. However, this time we reset the index to have the indicies sorted.
df = pd.DataFrame()
for i in range(13):
file_name = 'events_England_' + str(i+1) + '.json'
path = os.path.join(str(pathlib.Path().resolve().parents[0]), 'data', 'Wyscout', file_name)
with open(path) as f:
data = json.load(f)
df = pd.concat([df, pd.DataFrame(data)], ignore_index = True)
df = df.reset_index()
Preparing data
First we see create a new column with the next event. We create a column with 1 if the ball was kicked out - to mark when the chain should be stopped. We filter out Interruptions since there was an ball out of the pitch or foul before them. Then, we filter lost duels, since Wyscout duels are saved twice - for both team either defensive or offensive. We filter out events “Out of ball” since we can get this information from other column. Then, we filter out events by goalkeepers, since shots were made before.
next_event = df.shift(-1, fill_value=0)
df["nextEvent"] = next_event["subEventName"]
df["kickedOut"] = df.apply(lambda x: 1 if x.nextEvent == "Ball out of the field" else 0, axis = 1)
#interruptions out
interruption = df.loc[df["eventName"] == "Interruption"]
#probably need to drop "others on the ball event" - nope
# filter out non-accurate duels - in wyscout they are 2 way - attacking and defending
lost_duels = df.loc[df["eventName"] == "Duel"]
lost_duels = lost_duels.loc[lost_duels.apply (lambda x:{'id':1802} in x.tags, axis = 1)]
df = df.drop(lost_duels.index)
# filter ball out of the field - I can get this anyways
out_of_ball = df.loc[df["subEventName"] == "Ball out of the field"]
df = df.drop(out_of_ball.index)
# save attempts can be dropped
goalies = df.loc[df["subEventName"].isin(["Goalkeeper leaving line", "Save attempt", "Reflexes"])]
df = df.drop(goalies.index)
Isolating possesion chain
Then, we isolate possesion chain with the rule that if the ball was touched once by a different team, it should not change the results. If there was a foul, ball kicked out of the field the chain is stopped. For others in the ball, if the ball was properly intercepted that is, the next team is the one that made the next event, we stop the chain. Otherwise, if the ball was only touched, but did not change possesion, we treat a pass as an accurate one. Note that this is an approximate of true possesion chain. In the industry you will work on datasets with possesion chains already isolated.
def isolateChains(df):
"""
Parameters
----------
df : dataframe
dataframe with Wyscout event data.
Returns
-------
df: dataframe
dataframe with isolated possesion chains
"""
df["nextTeamId"] = df.shift(-1, fill_value=0)["teamId"]
#potential +0s
chain_team = df.iloc[0]["teamId"]
period = df.iloc[0]["matchPeriod"]
stop_criterion = 0
chain = 0
df["possesion_chain"] = 0
df["possesion_chain_team"] = 0
for i, row in df.iterrows():
#add value
df.at[i, "possesion_chain"] = chain
df.at[i, "possesion_chain_team"] = chain_team
# if pass not accurate/lost duel, add 1 to stop criterion
if row["eventName"] == "Pass" or row["eventName"] == "Duel":
if row["teamId"] == chain_team and {"id": 1802} in row["tags"]:
stop_criterion += 1
if row["teamId"] != chain_team and {"id": 1801} in row["tags"]:
stop_criterion += 1
#if ball intercepted properly add 2
if row["eventName"] == "Others on the ball":
if row["teamId"] == row["nextTeamId"]:
stop_criterion += 2
#if shot, add 2 to stop criteriom
if row["eventName"] in ["Shot", "Foul", "Offside"]:
stop_criterion += 2
#if ball out of field, add 2
if row["kickedOut"] == 1:
stop_criterion += 2
#criterion for stopping when half ended
if row["matchPeriod"] != period:
chain += 1
stop_criterion = 0
chain_team = row['teamId']
period = row["matchPeriod"]
df.at[i, "possesion_chain"] = chain
df.at[i, "possesion_chain_team"] = chain_team
#possesion chain ended
if stop_criterion >= 2:
chain += 1
stop_criterion = 0
chain_team = row['nextTeamId']
return df
df = isolateChains(df)
#investigate a chain
df.loc[df["possesion_chain"] == 4][["eventName", "possesion_chain"]]
Calculating xG value
To calculate xG value of shots, we use the function from Lesson 2. However, this time we treat penalties as shots from the penalty spot. Then, we assign xG value to shots in df with all events.
def calulatexG(df):
"""
Parameters
----------
df : dataframe
dataframe with Wyscout event data.
Returns
-------
xG_sum: dataframe
dataframe with xG for each shot
"""
#very basic xG model based on
shots = df.loc[df["eventName"] == "Shot"].copy()
shots["X"] = shots.positions.apply(lambda cell: (100 - cell[0]['x']) * 105/100)
shots["Y"] = shots.positions.apply(lambda cell: cell[0]['y'] * 68/100)
shots["C"] = shots.positions.apply(lambda cell: abs(cell[0]['y'] - 50) * 68/100)
#calculate distance and angle
shots["Distance"] = np.sqrt(shots["X"]**2 + shots["C"]**2)
shots["Angle"] = np.where(np.arctan(7.32 * shots["X"] / (shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) > 0, np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)), np.arctan(7.32 * shots["X"] /(shots["X"]**2 + shots["C"]**2 - (7.32/2)**2)) + np.pi)
#if you ever encounter problems (like you have seen that model treats 0 as 1 and 1 as 0) while modelling - change the dependant variable to object
shots["Goal"] = shots.tags.apply(lambda x: 1 if {'id':101} in x else 0).astype(object)
#headers have id = 403
headers = shots.loc[shots.apply (lambda x:{'id':403} in x.tags, axis = 1)]
non_headers = shots.drop(headers.index)
headers_model = smf.glm(formula="Goal ~ Distance + Angle" , data=headers,
family=sm.families.Binomial()).fit()
#non-headers
nonheaders_model = smf.glm(formula="Goal ~ Distance + Angle" , data=non_headers,
family=sm.families.Binomial()).fit()
#assigning xG
df["xG"] = 0
b_head = headers_model.params
xG = 1/(1+np.exp(b_head[0]+b_head[1]*headers['Distance'] + b_head[2]*headers['Angle']))
headers = headers.assign(xG = xG)
for index, row in headers.iterrows():
df.at[index, "xG"] = row["xG"]
#non-headers
b_nhead = nonheaders_model.params
xG = 1/(1+np.exp(b_nhead[0]+b_nhead[1]*non_headers['Distance'] + b_nhead[2]*non_headers['Angle']))
non_headers = non_headers.assign(xG = xG)
for index, row in non_headers.iterrows():
df.at[index, "xG"] = row["xG"]
penalties = df.loc[df["subEventName"] == "Penalty"]
#treating penalties like shots
penalties["X"] = 11
#calculate distance and angle
penalties["Distance"] = 11
penalties["Angle"] = np.arctan(7.32 * 11 /(11**2 - (7.32/2)**2))
#if you ever encounter problems (like you have seen that model treats 0 as 1 and 1 as 0) while modelling - change the dependant variable to object
penalties["Goal"] = penalties.tags.apply(lambda x: 1 if {'id':101} in x else 0).astype(object)
penalties = penalties.assign(xG = xG)
for index, row in penalties.iterrows():
df.at[index, "xG"] = row["xG"]
return df
df = calulatexG(df)
#investigate a chain
df.loc[df["possesion_chain"].isin([3,4])][["eventName", "possesion_chain", "xG"]]
Finding chains that ended with shot
As the next step we find possesion chains that ended with shot. We assgin 1 to them. We also assign xG of the shot to all events in the chain. If a chain was stopped by a foul, also chain before is considered as one that ended with shot. We also keep only events made by possesion team in the chain.
def prepareChains(df):
"""
Parameters
----------
df : dataframe
dataframe with Wyscout event data.
Returns
-------
xG_sum: dataframe
dataframe with assigned values for chains
"""
df["shot_end"] = 0
#get number of chains
no_chains = max(df["possesion_chain"].unique())
indicies = []
for i in range(no_chains+1):
#all events get possesion chain
possesion_chain_df = df.loc[df["possesion_chain"] == i]
#check if the possesion chain is not empty
if len(possesion_chain_df) > 0:
#if ended with shot
if possesion_chain_df.iloc[-1]["eventName"] == "Shot":
#assign values
df.loc[df["possesion_chain"] == i, "shot_end"] = 1
xG = possesion_chain_df.iloc[-1]["xG"]
df.loc[df["possesion_chain"] == i, "xG"] = xG
#check if the previous ones did not end with foul
k = i-1
if k > 0:
try:
prev = df.loc[df["possesion_chain"] == k]
#create a loop if e.g. 2 chains before and 1 chain before didn;t end with shot
while prev.iloc[-1]["eventName"] == "Foul":
#assign value for them
df.loc[df["possesion_chain"] == k, "xG"] = xG
df.loc[df["possesion_chain"] == k, "shot_end"] = 1
k = k-1
prev = df.loc[df["possesion_chain"] == k]
except:
k = k-1
#get indiices of events made by possesion team
team_indicies = possesion_chain_df.loc[possesion_chain_df["teamId"] == possesion_chain_df.teamId.mode().iloc[0]].index.values.tolist()
indicies.extend(team_indicies)
df = df.loc[indicies]
return df
df = prepareChains(df)
df.loc[df["possesion_chain"].isin([3,4])][["eventName", "possesion_chain", "xG"]]
Preparing data for modelling
As the next step we prepare data for modelling. We filter out dodgy events without end coordinates and store coordinates in different columns. We also create variable c as the distance of line parallel to x-axis throught the middle of the pitch. Moreover, we set end of a shot as (105, 34) contrary to Wyscout, which tends to store their ends as one of the corners.
#filter out dodgy
df = df.loc[df.apply(lambda x: len(x.positions) == 2, axis = 1)]
#columns with coordinates
df["x0"] = df.positions.apply(lambda cell: (cell[0]['x']) * 105/100)
df["c0"] = df.positions.apply(lambda cell: abs(50 - cell[0]['y']) * 68/100)
df["x1"] = df.positions.apply(lambda cell: (cell[1]['x']) * 105/100)
df["c1"] = df.positions.apply(lambda cell: abs(50 - cell[1]['y']) * 68/100)
#assign (105, 0) to end of the shot
df.loc[df["eventName"] == "Shot", "x1"] = 105
df.loc[df["eventName"] == "Shot", "c1"] = 0
#for plotting
df["y0"] = df.positions.apply(lambda cell: (100 - cell[0]['y']) * 68/100)
df["y1"] = df.positions.apply(lambda cell: (100 - cell[1]['y']) * 68/100)
df.loc[df["eventName"] == "Shot", "y1"] = 34
Possesion chain that ended with a shot
Now we can plot possesion chains. Here is a one that ended with a shot. Note that actions that were not passes and not shots are marked wth a grey dotted line. In this case, these are duels and a free kick.
#plot possesion chain that ended with shot
chain = df.loc[df["possesion_chain"] == 4]
#get passes
passes = chain.loc[chain["eventName"].isin(["Pass"])]
#get events different than pass
not_pass = chain.loc[chain["eventName"] != "Pass"].iloc[:-1]
#shot is the last event of the chain (or should be)
shot = chain.iloc[-1]
#plot
pitch = Pitch(line_color='black',pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder = 2)
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False,
endnote_height=0.04, title_space=0, endnote_space=0)
#passes
pitch.arrows(passes.x0, passes.y0,
passes.x1, passes.y1, color = "blue", ax=ax['pitch'], zorder = 3)
#shot
pitch.arrows(shot.x0, shot.y0,
shot.x1, shot.y1, color = "red", ax=ax['pitch'], zorder = 3)
#other passes like arrows
pitch.lines(not_pass.x0, not_pass.y0, not_pass.x1, not_pass.y1, color = "grey", lw = 1.5, ls = 'dotted', ax=ax['pitch'])
ax['title'].text(0.5, 0.5, 'Passes leading to a shot', ha='center', va='center', fontsize=30)
plt.show()
Possesion chain that did not end with a shot
And here is a one that did not end with a shot.
#plot possesion chain that ended with shot
chain = df.loc[df["possesion_chain"] == 0]
passes = chain.loc[chain["eventName"].isin(["Pass", "Free Kick"])].iloc[:-1]
not_pass = chain.loc[(chain["eventName"] != "Pass") & (chain["eventName"] != "Free Kick")].iloc[:-1]
bad_pass = chain.iloc[-1]
#we could have used better
pitch = Pitch(line_color='black',pitch_type='custom', pitch_length=105, pitch_width=68, line_zorder = 2)
fig, ax = pitch.grid(grid_height=0.9, title_height=0.06, axis=False,
endnote_height=0.04, title_space=0, endnote_space=0)
pitch.arrows(passes.x0, passes.y0,
passes.x1, passes.y1, color = "blue", ax=ax['pitch'], zorder = 3)
pitch.arrows(bad_pass.x0, bad_pass.y0,
bad_pass.x1, bad_pass.y1, color = "purple", ax=ax['pitch'], zorder = 3)
pitch.scatter(bad_pass.x1, bad_pass.y1, marker = 'x', color = "red", ax=ax['pitch'], zorder = 3, s= 200)
pitch.lines(not_pass.x0, not_pass.y0, not_pass.x1, not_pass.y1, color = "grey", lw = 1.5, ls = 'dotted', ax=ax['pitch'])
ax['title'].text(0.5, 0.5, 'Passes not ending in a shot', ha='center', va='center', fontsize=30)
plt.show()
Total running time of the script: ( 3 minutes 31.749 seconds)