目录
1. 下载数据集
2. 数据预处理
3. 模型训练与选择
4. 预测
1. 下载数据集
下载后数据如下:
FIFA World Cup | Kaggle
2. 数据预处理
reprocess_dataset() 方法是数据进行预处理。预处理过的数据如下:
save_dataset() 方法是对预处理过的数据,进行向量化。
完整代码如下:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
import joblib
root_path = "models"
def reprocess_dataset():
#load data
results = pd.read_csv('datasets/WorldCupMatches.csv', encoding='gbk')
#Adding goal difference and establishing who is the winner
winner = []
for i in range (len(results['Home Team Name'])):
if results ['Home Team Goals'][i] > results['Away Team Goals'][i]:
winner.append(results['Home Team Name'][i])
elif results['Home Team Goals'][i] < results ['Away Team Goals'][i]:
winner.append(results['Away Team Name'][i])
else:
winner.append('Draw')
results['winning_team'] = winner
#adding goal difference column
results['goal_difference'] = np.absolute(results['Home Team Goals'] - results['Away Team Goals'])
# narrowing to team patcipating in the world cup, totally there are 32 football teams in 2022
worldcup_teams = ['Qatar','Germany','Denmark', 'Brazil','France','Belgium', 'Serbia',
'Spain','Croatia', 'Switzerland', 'England','Netherlands', 'Argentina',' Iran',
'Korea Republic','Saudi Arabia', 'Japan', 'Uruguay','Ecuador','Canada',
'Senegal', 'Poland', 'Portugal','Tunisia', 'Morocco','Cameroon','USA',
'Mexico','Wales','Australia','Costa Rica', 'Ghana']
df_teams_home = results[results['Home Team Name'].isin(worldcup_teams)]
df_teams_away = results[results['Away Team Name'].isin(worldcup_teams)]
df_teams = pd.concat((df_teams_home, df_teams_away))
df_teams.drop_duplicates()
df_teams.count()
#dropping columns that wll not affect matchoutcomes
df_teams_new =df_teams[[ 'Home Team Name','Away Team Name','winning_team']]
print(df_teams_new.head() )
#Building the model
#the prediction label: The winning_team column will show "2" if the home team has won, "1" if it was a tie, and "0" if the away team has won.
df_teams_new = df_teams_new.reset_index(drop=True)
df_teams_new.loc[df_teams_new.winning_team == df_teams_new['Home Team Name'],'winning_team']=2
df_teams_new.loc[df_teams_new.winning_team == 'Draw', 'winning_team']=1
df_teams_new.loc[df_teams_new.winning_team == df_teams_new['Away Team Name'], 'winning_team']=0
print(df_teams_new.count() )
df_teams_new.to_csv('datasets/raw_train_data.csv', encoding='gbk', index =False)
def save_dataset():
df_teams_new = pd.read_csv('datasets/raw_train_data.csv', encoding='gbk')
feature = df_teams_new[[ 'Home Team Name','Away Team Name']]
vec = DictVectorizer(sparse=False)
print(feature.to_dict(orient='records'))
X =vec.fit_transform(feature.to_dict(orient='records'))
X = X.astype('int')
print("===")
print(vec.get_feature_names())
print(vec.feature_names_)
y = df_teams_new[[ 'winning_team']]
y =y.astype('int')
print(X.shape)
print(y.shape)
joblib.dump(vec, root_path+"/vec.joblib")
np.savez('datasets/train_data', x= X, y = y)
if __name__ == '__main__':
reprocess_dataset()
save_dataset();
3. 模型训练与选择
用不同的传统机器学习方法进行训练,训练后的模型比较
Model | Training Accuracy | Test Accuracy |
Logistic Regression | 67.40% | 61.60% |
SVM | 67.30% | 62.70% |
Naive Bayes | 65.50% | 63.80% |
Random Forest | 90.80% | 65.50% |
XGB | 75.30% | 62.00% |
可以看到随机森林模型在测试集上准确率最高,所以我们可以用它来做预测。
下面是完整训练代码:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import sklearn as sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
root_path = "models"
def get_dataset():
train_data = np.load('datasets/train_data.npz')
return train_data
def train_by_LogisticRegression(train_data):
X = train_data['x']
y = train_data['y']
# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
joblib.dump(logreg, root_path+'/LogisticRegression_model.joblib')
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)
print("LogisticRegression Training set accuracy: ", '%.3f'%(score))
print("LogisticRegression Test set accuracy: ", '%.3f'%(score2))
def train_by_svm(train_data):
X = train_data['x']
y = train_data['y']
# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model = svm.SVC(kernel='linear', verbose=True, probability=True)
model.fit(X_train, y_train)
joblib.dump(model, root_path+'/svm_model.joblib')
score = model.score(X_train, y_train)
score2 = model.score(X_test, y_test)
print("SVM Training set accuracy: ", '%.3f' % (score))
print("SVM Test set accuracy: ", '%.3f' % (score2))
def train_by_naive_bayes(train_data):
X = train_data['x']
y = train_data['y']
# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
joblib.dump(model, root_path+'/naive_bayes_model.joblib')
score = model.score(X_train, y_train)
score2 = model.score(X_test, y_test)
print("naive_bayes Training set accuracy: ", '%.3f' % (score))
print("naive_bayes Test set accuracy: ", '%.3f' % (score2))
def train_by_random_forest(train_data):
X = train_data['x']
y = train_data['y']
# Separate train and test sets
X_train = X
y_train = y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model = RandomForestClassifier(criterion='gini', max_features='sqrt')
model.fit(X_train, y_train)
joblib.dump(model, root_path+'/random_forest_model.joblib')
score = model.score(X_train, y_train)
score2 = model.score(X_test, y_test)
print("random forest Training set accuracy: ", '%.3f' % (score))
print("random forest Test set accuracy: ", '%.3f' % (score2))
def train_by_xgb(train_data):
X = train_data['x']
y = train_data['y']
# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)
joblib.dump(model, root_path+'/xgb_model.joblib')
score = model.score(X_train, y_train)
score2 = model.score(X_test, y_test)
print("xgb Training set accuracy: ", '%.3f' % (score))
print("xgb Test set accuracy: ", '%.3f' % (score2))
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
# show_confusion_matrix(y_test, y_pred)
print(report)
def show_confusion_matrix(y_true, y_pred, pic_name = "confusion_matrix"):
confusion = confusion_matrix(y_true=y_true, y_pred=y_pred)
print(confusion)
sns.heatmap(confusion, annot=True, cmap= 'Blues', xticklabels=['0','1','2'], yticklabels=['0','1','2'], fmt = '.20g')
plt.xlabel('Predicted class')
plt.ylabel('Actual Class')
plt.title(pic_name)
# plt.savefig('pic/' + pic_name)
plt.show()
if __name__ == '__main__':
train_data = get_dataset()
train_by_LogisticRegression(train_data)
train_by_svm(train_data)
train_by_naive_bayes(train_data)
train_by_random_forest(train_data)
train_by_xgb(train_data)
4. 预测
执行下面预测代码,结果是Ecuador胜于Qatar, 英国队胜于伊朗队。
[2]
[[0.05 0.22033333 0.72966667]]
Probability of Ecuador winning: 0.730
Probability of Draw: 0.220
Probability of Qatar winning: 0.050
[2]
[[0.02342857 0.21770455 0.75886688]]
Probability of England winning: 0.759
Probability of Draw: 0.218
Probability of Iran winning: 0.023
完整代码
import joblib
worldcup_teams = ['Qatar','Germany','Denmark', 'Brazil','France','Belgium', 'Serbia',
'Spain','Croatia', 'Switzerland', 'England','Netherlands', 'Argentina',' Iran',
'Korea Republic','Saudi Arabia', 'Japan', 'Uruguay','Ecuador','Canada',
'Senegal', 'Poland', 'Portugal','Tunisia', 'Morocco','Cameroon','USA',
'Mexico','Wales','Australia','Costa Rica', 'Ghana']
root_path = "models"
def verify_team_name(team_name):
for worldcup_team in worldcup_teams:
if team_name==worldcup_team:
return True
return False
def predict(model_dir =root_path+'/LogisticRegression_model.joblib', team_a='France', team_b = 'Mexico'):
if not verify_team_name(team_a):
print(team_a, ' is not correct')
return
if not verify_team_name(team_b) :
print(team_b, ' is not correct')
return
logreg = joblib.load(model_dir)
input_x = [{'Home Team Name': team_a, 'Away Team Name': team_b}]
vec = joblib.load(root_path+"/vec.joblib")
input_x = vec.transform(input_x)
result = logreg.predict(input_x)
print(result)
result1 = logreg.predict_proba(input_x)
print(result1)
print('Probability of ',team_a , ' winning:', '%.3f'%result1[0][2])
print('Probability of Draw:', '%.3f' % result1[0][1])
print('Probability of ', team_b, ' winning:', '%.3f' % result1[0][0])
if __name__ == '__main__':
team_a = 'Ecuador'
team_b = 'Qatar'
predict('models/random_forest_model.joblib', team_a, team_b)
team_a = 'England'
team_b = ' Iran'
predict('models/random_forest_model.joblib', team_a, team_b)
5. 个人总结
特征少的可怜,如果可以加一些球员的信息和状态的特征会更好,数据也相对太少,如果可以把欧洲杯,亚洲杯,非洲杯和美洲杯的每届数据加入进来就好了。数据有点旧(1930年后所有数据),这也是没办法,因为没有其它数据了。
如果预测的是比分,是不是也可以用分类做呢?比如最多可以踢入4个球每个队,当然像西班牙队可以踢进去7个球太罕见了,我们可以忽略不计,哈哈。那就是25个类别。效果会不会好呢?有待检验。
6. 2018后新的数据
Year | Home Team Name | Away Team Name | winning_team |
2018 | France | Croatia | 2 |
2018 | Belgium | England | 2 |
2018 | Croatia | England | 2 |
2018 | France | Belgium | 2 |
2018 | Russia | Croatia | 1 |
2018 | Sweden | England | 0 |
2018 | Brazil | Belgium | 0 |
2018 | Uruguay | France | 0 |
2018 | Colombia | England | 1 |
2018 | Sweden | Switzerland | 2 |
2018 | Belgium | Japan | 2 |
2018 | Brazil | Mexico | 2 |
2018 | Croatia | Denmark | 1 |
2018 | Spain | Russia | 1 |
2018 | Uruguay | Portugal | 2 |
2018 | France | Argentina | 2 |
2018 | England | Belgium | 0 |
2018 | Paraguay | Tunisia | 0 |
2018 | Senegal | Colombia | 0 |
2018 | Japan | Poland | 0 |
2018 | Switzerland | Costa Rica | 1 |
2018 | Serbia | Brazil | 0 |
2018 | Mexico | Sweden | 0 |
2018 | Korea Republic | Germany | 2 |
2018 | Iceland | Croatia | 0 |
2018 | Nigeria | Argentina | 0 |
2018 | Denmark | France | 1 |
2018 | Australia | Peru | 0 |
2018 | IR Iran | Portugal | 1 |
2018 | Spain | Morocco | 1 |
2018 | Saudi Arabia | Egypt | 2 |
2018 | Uruguay | Russia | 2 |
2018 | Poland | Colombia | 0 |
2018 | Japan | Senegal | 1 |
2018 | England | Panama | 2 |
2018 | Germany | Sweden | 2 |
2018 | Korea Republic | Mexico | 0 |
2018 | Belgium | Tunisia | 2 |
2018 | Serbia | Switzerland | 0 |
2018 | Nigeria | Iceland | 2 |
2018 | Brazil | Costa Rica | 2 |
2018 | Argentina | Croatia | 0 |
2018 | France | Peru | 2 |
2018 | Denmark | Australia | 1 |
2018 | IR Iran | Spain | 0 |
2018 | Uruguay | Saudi Arabia | 2 |
2018 | Portugal | Morocco | 2 |
2018 | Russia | Egypt | 2 |
2018 | Poland | Senegal | 0 |
2018 | Colombia | Japan | 0 |
2018 | Tunisia | England | 0 |
2018 | Belgium | Panama | 2 |
2018 | Sweden | Korea Republic | 2 |
2018 | Brazil | Switzerland | 1 |
2018 | Germany | Mexico | 0 |
2018 | Costa Rica | Serbia | 0 |
2018 | Croatia | Nigeria | 2 |
2018 | Peru | Denmark | 0 |
2018 | Argentina | Iceland | 1 |
2018 | France | Australia | 2 |
2018 | Portugal | Spain | 1 |
2018 | Morocco | IR Iran | 0 |
2018 | Egypt | Uruguay | 0 |
2018 | Russia | Saudi Arabia | 2 |
2022 | Qatar | Ecuador | 0 |
2022 | England | Iran | 2 |
2022 | Senegal | Netherlands | 0 |
2022 | USA | Wales | 1 |
2022 | Argentina | Saudi Arabia | 0 |
2022 | Denmark | Tunisia | 1 |
2022 | Mexico | Poland | 1 |
2022 | France | Australia | 2 |
2022 | Morocco | Croatia | 1 |
2022 | Germany | Japan | 0 |
2022 | Spain | Costa Rica | 2 |
2022 | Belgium | Canada | 2 |
2022 | Switzerland | Cameroon | 2 |
2022 | Uruguay | Korea Republic | 1 |
2022 | Portugal | Ghana | 2 |
2022 | Brazil | Serbia | 2 |
2022 | Wales | Iran | 0 |
2022 | Qatar | Senegal | 0 |
2022 | Netherlands | Ecuador | 1 |
2022 | England | USA | 1 |
2022 | Tunisia | Australia | 0 |
2022 | Poland | Saudi Arabia | 2 |
2022 | France | Denmark | 2 |
2022 | Argentina | Mexico | 2 |
2022 | Japan | Costa Rica | 0 |
2022 | Belgium | Morocco | 0 |
2022 | Croatia | Canada | 2 |
2022 | Spain | Germany | 1 |