In [1]:
%matplotlib inline

import pandas as pd

import didactic_datamining as ddm
In [ ]:
#dataset = ddm.create_dataset(npoints=10, minvalue=0, maxvalue=10)
#ddm.print_dataset(dataset)

Exercise 1 - k-means (15 points)

a) Apply k-means to the dataset in the table and figure below using K=2 and the initial centroids c1=P2 and c2=P5. Explain what happens in each iteration (10 points).
b) Discuss the reason for the termination of k-means (3 points).
c) Identify another pair of initial centroids that leads to the same clustering obtained in a) (2 points).

In [4]:
# Input for the k-means exercise: ten points with two coordinates each.
# The trailing comment on every row is its 0-based position in the list.
dataset = [
    [5, 4],  # 0
    [3, 2],  # 1
    [6, 2],  # 2
    [2, 3],  # 3
    [8, 9],  # 4
    [2, 0],  # 5
    [3, 7],  # 6
    [2, 9],  # 7
    [4, 5],  # 8
    [3, 3],  # 9
]
In [5]:
# k-means with K=2 and Euclidean distance on the exercise dataset.
# centroid_indexs selects the starting centroids by 0-based row position:
# the recorded output begins with C0 (6.00, 2.00) and C1 (2.00, 0.00),
# i.e. dataset[2] and dataset[5].
# NOTE(review): the exercise text names the centroids P2 and P5 -- confirm that
# the figure labels points from P0, otherwise these indexes are off by one.
kmeans = ddm.DidatticKMeans(
    K=2,
    centroid_indexs=(2, 5),
    dist=ddm.euclidean_distance,
)
# step_by_step=False presumably runs every iteration without pausing
# (TODO confirm in ddm); the centroids of each iteration are printed.
kmeans.fit(dataset, step_by_step=False)
C0 (6.00, 2.00)
C1 (2.00, 0.00)
C0 (2.50, 2.00)
C1 (4.67, 6.00)
C0 (4.40, 6.80)
C1 (3.20, 2.00)
C0 (3.50, 2.33)
C1 (4.25, 7.50)
C0 (3.50, 2.33)
C1 (4.25, 7.50)
Out[5]:
<didactic_kmeans.DidatticKMeans instance at 0x108491440>
In [ ]:
 

Exercise 2 - Dbscan (15 points)

a) Apply the Dbscan algorithm to the dataset in the table and figure below with radius eps=1.8 and minPts=2 (1 neighbor plus the point itself), and for each point specify whether it is a core point, a border point or noise (10 points).
b) Indicate the composition of the clusters obtained (2 points).
c) Add the minimum number of points needed to turn the noise points into border points (3 points).

In [9]:
# Input for the Dbscan exercise: ten points with two coordinates each.
# This rebinds `dataset`; rows 4 and 6 differ from the k-means exercise data.
# The trailing comment on every row is its 0-based position in the list.
dataset = [
    [5, 4],  # 0
    [3, 2],  # 1
    [6, 2],  # 2
    [2, 3],  # 3
    [5, 5],  # 4
    [2, 0],  # 5
    [3, 6],  # 6
    [2, 9],  # 7
    [4, 5],  # 8
    [3, 3],  # 9
]
In [10]:
# Density-based clustering of the exercise dataset.
# NOTE(review): the exercise statement asks for eps=1.8 and minPts=2
# (1 neighbor plus the point itself), while this cell runs eps=1.9 and
# min_pts=3. In the recorded output, point 6 (one neighbor within eps) is
# reported as a border point, not a core point -- confirm how ddm counts
# min_pts and which pair of parameters is the intended one.
dbscan = ddm.DidatticDbscan(
    eps=1.9,
    min_pts=3,
)
# Prints the core/border/noise point indexes and the resulting clusters.
dbscan.fit(dataset, step_by_step=False)
Core Points [0, 1, 3, 4, 8, 9]
Border Points [6]
Noise Points [2, 5, 7]
defaultdict(<type 'set'>, {0: set([0, 8, 4, 6]), 1: set([1, 3, 9])})
In [ ]:
 

Exercise 3 - Decision Tree (25 points)

a) Use the training dataset below to build a decision tree for the target variable “CHURN”, using the misclassification rate as the splitting criterion, and expand the nodes of the tree until no split provides a gain (18 points).
b) Provide the confusion matrix and evaluate the accuracy, precision, recall and f1-measure of the tree with respect to the test set AND the training set. You MUST provide the formulas for accuracy, precision, recall and f1-measure (7 points).

In [36]:
# Training set for the decision-tree exercise; the tree below is fitted on it
# with 'Churn' as the target column.
# skipinitialspace=True makes pandas ignore spaces right after each delimiter.
dataset_df = pd.read_csv(
    'dataset_compito_dm_20170612.csv',
    sep=',',
    skipinitialspace=True,
)
dataset_df
Out[36]:
State Contract Sex PreviousCompany Churn
0 Italy Classic F Tim YES
1 German Travel M Wind NO
2 France Travel M Tim YES
3 Italy Young F Tim NO
4 German Travel F Tim NO
5 Italy Classic M Tim YES
6 Italy Classic M Tim YES
7 German Young F Tim YES
8 German Young M Tim NO
9 France Travel F Tim NO
10 Italy Young M Tim YES
11 German Classic F Wind YES
12 France Young F Tim NO
13 France Classic F Tim NO
In [37]:
# Held-out test set, read with the same parsing options as the training set.
# skipinitialspace=True makes pandas ignore spaces right after each delimiter.
test_df = pd.read_csv(
    'testset_compito_dm_20170612.csv',
    sep=',',
    skipinitialspace=True,
)
test_df
Out[37]:
State Contract Sex PreviousCompany Churn
0 Italy Classic F Tim YES
1 German Travel M Wind YES
2 Italy Travel M Wind YES
3 German Young F Tim YES
4 France Travel M Wind NO
5 France Classic F Tim NO
In [38]:
# Didactic classification tree that scores splits with ddm.error_rate
# (labelled 'misc rate'); the recorded trace shows each node's impurity as
# 1 - (majority-class fraction) and the gain of every candidate split.
# min_samples_split / min_samples_leaf are presumably the minimum node sizes
# required to attempt a split and to keep a leaf -- TODO confirm in ddm.
tree = ddm.DidatticClassificationTree(
    fun=ddm.error_rate,
    fun_name='misc rate',
    min_samples_split=2,
    min_samples_leaf=1,
    step_by_step=False,
)
# Grow the tree on the training frame, predicting the 'Churn' column.
tree.fit(dataset_df, target='Churn')
Root
Parent
	1 - 7/14 = 7/14
State ['France', 'German', 'Italy']
	France 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	German 5
		NO, 3/5
		YES, 2/5
		1 - 3/5 = 2/5
	Italy 5
		NO, 1/5
		YES, 4/5
		1 - 4/5 = 1/5
	(2/5 * 5/14) + (1/5 * 5/14) + (1/4 * 4/14) = 4/14
State# ['France', 'German-Italy']
	France 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	German-Italy 10
		NO, 4/10
		YES, 6/10
		1 - 6/10 = 4/10
	(4/10 * 10/14) + (1/4 * 4/14) = 5/14
	Delta Gain: 7/14 - 5/14 = 2/14

State# ['German', 'Italy-France']
	German 5
		NO, 3/5
		YES, 2/5
		1 - 3/5 = 2/5
	Italy-France 9
		NO, 4/9
		YES, 5/9
		1 - 5/9 = 4/9
	(2/5 * 5/14) + (4/9 * 9/14) = 6/14
	Delta Gain: 7/14 - 6/14 = 1/14

State# ['German-France', 'Italy']
	German-France 9
		NO, 6/9
		YES, 3/9
		1 - 6/9 = 3/9
	Italy 5
		NO, 1/5
		YES, 4/5
		1 - 4/5 = 1/5
	(3/9 * 9/14) + (1/5 * 5/14) = 4/14
	Delta Gain: 7/14 - 4/14 = 3/14

Contract ['Classic', 'Travel', 'Young']
	Classic 5
		NO, 1/5
		YES, 4/5
		1 - 4/5 = 1/5
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	Young 5
		NO, 3/5
		YES, 2/5
		1 - 3/5 = 2/5
	(1/4 * 4/14) + (2/5 * 5/14) + (1/5 * 5/14) = 4/14
Contract# ['Travel-Classic', 'Young']
	Travel-Classic 9
		NO, 4/9
		YES, 5/9
		1 - 5/9 = 4/9
	Young 5
		NO, 3/5
		YES, 2/5
		1 - 3/5 = 2/5
	(4/9 * 9/14) + (2/5 * 5/14) = 6/14
	Delta Gain: 7/14 - 6/14 = 1/14

Contract# ['Travel', 'Young-Classic']
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	Young-Classic 10
		NO, 4/10
		YES, 6/10
		1 - 6/10 = 4/10
	(1/4 * 4/14) + (4/10 * 10/14) = 5/14
	Delta Gain: 7/14 - 5/14 = 2/14

Contract# ['Classic', 'Travel-Young']
	Classic 5
		NO, 1/5
		YES, 4/5
		1 - 4/5 = 1/5
	Travel-Young 9
		NO, 6/9
		YES, 3/9
		1 - 6/9 = 3/9
	(3/9 * 9/14) + (1/5 * 5/14) = 4/14
	Delta Gain: 7/14 - 4/14 = 3/14

Sex ['F', 'M']
	F 8
		NO, 5/8
		YES, 3/8
		1 - 5/8 = 3/8
	M 6
		NO, 2/6
		YES, 4/6
		1 - 4/6 = 2/6
	(2/6 * 6/14) + (3/8 * 8/14) = 5/14
	Delta Gain: 7/14 - 5/14 = 2/14

PreviousCompany ['Tim', 'Wind']
	Tim 12
		NO, 6/12
		YES, 6/12
		1 - 6/12 = 6/12
	Wind 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(6/12 * 12/14) + (1/2 * 2/14) = 7/14
	Delta Gain: 7/14 - 7/14 = 0/14

--> Split By State#France&German&Italy 

Root_State#France&German&Italy?France
Parent
	1 - 3/4 = 1/4
Contract ['Classic', 'Travel', 'Young']
	Classic 1
		NO, 1/1
		1 - 1 = 0
	Travel 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	Young 1
		NO, 1/1
		1 - 1 = 0
	(1/2 * 2/4) + (0 * 1/4) + (0 * 1/4) = 1/4
Contract# ['Classic', 'Travel-Young']
	Classic 1
		NO, 1/1
		1 - 1 = 0
	Travel-Young 3
		NO, 2/3
		YES, 1/3
		1 - 2/3 = 1/3
	(1/3 * 3/4) + (0 * 1/4) = 1/4
	Delta Gain: 1/4 - 1/4 = 0/4

Contract# ['Travel-Classic', 'Young']
	Travel-Classic 3
		NO, 2/3
		YES, 1/3
		1 - 2/3 = 1/3
	Young 1
		NO, 1/1
		1 - 1 = 0
	(1/3 * 3/4) + (0 * 1/4) = 1/4
	Delta Gain: 1/4 - 1/4 = 0/4

Contract# ['Travel', 'Young-Classic']
	Travel 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	Young-Classic 2
		NO, 2/2
		1 - 2/2 = 0/2
	(1/2 * 2/4) + (0/2 * 2/4) = 1/4
	Delta Gain: 1/4 - 1/4 = 0/4

Sex ['F', 'M']
	F 3
		NO, 3/3
		1 - 3/3 = 0/3
	M 1
		YES, 1/1
		1 - 1 = 0
	(0 * 1/4) + (0/3 * 3/4) = 0/4
	Delta Gain: 1/4 - 0/4 = 1/4

--> Split By Contract#Classic&Travel&Young 

Root_State#France&German&Italy?France_Contract#Classic&Travel&Young?Classic
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?France_Contract#Classic&Travel&Young?Travel
Parent
	1 - 1/2 = 1/2
Sex ['F', 'M']
	F 1
		NO, 1/1
		1 - 1 = 0
	M 1
		YES, 1/1
		1 - 1 = 0
	(0 * 1/2) + (0 * 1/2) = 0/2
	Delta Gain: 1/2 - 0/2 = 1/2

--> Split By Sex#F&M 

Root_State#France&German&Italy?France_Contract#Classic&Travel&Young?Travel_Sex#F&M?F
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?France_Contract#Classic&Travel&Young?Travel_Sex#F&M?M
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?France_Contract#Classic&Travel&Young?Young
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?German
Parent
	1 - 3/5 = 2/5
Contract ['Classic', 'Travel', 'Young']
	Classic 1
		YES, 1/1
		1 - 1 = 0
	Travel 2
		NO, 2/2
		1 - 2/2 = 0/2
	Young 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(0/2 * 2/5) + (1/2 * 2/5) + (0 * 1/5) = 1/5
Contract# ['Classic', 'Travel-Young']
	Classic 1
		YES, 1/1
		1 - 1 = 0
	Travel-Young 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	(1/4 * 4/5) + (0 * 1/5) = 1/5
	Delta Gain: 2/5 - 1/5 = 1/5

Contract# ['Travel-Classic', 'Young']
	Travel-Classic 3
		NO, 2/3
		YES, 1/3
		1 - 2/3 = 1/3
	Young 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/3 * 3/5) + (1/2 * 2/5) = 2/5
	Delta Gain: 2/5 - 2/5 = 0/5

Contract# ['Travel', 'Young-Classic']
	Travel 2
		NO, 2/2
		1 - 2/2 = 0/2
	Young-Classic 3
		NO, 1/3
		YES, 2/3
		1 - 2/3 = 1/3
	(0/2 * 2/5) + (1/3 * 3/5) = 1/5
	Delta Gain: 2/5 - 1/5 = 1/5

Sex ['F', 'M']
	F 3
		NO, 1/3
		YES, 2/3
		1 - 2/3 = 1/3
	M 2
		NO, 2/2
		1 - 2/2 = 0/2
	(0/2 * 2/5) + (1/3 * 3/5) = 1/5
	Delta Gain: 2/5 - 1/5 = 1/5

PreviousCompany ['Tim', 'Wind']
	Tim 3
		NO, 2/3
		YES, 1/3
		1 - 2/3 = 1/3
	Wind 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/3 * 3/5) + (1/2 * 2/5) = 2/5
	Delta Gain: 2/5 - 2/5 = 0/5

--> Split By Contract#Classic&Travel&Young 

Root_State#France&German&Italy?German_Contract#Classic&Travel&Young?Classic
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?German_Contract#Classic&Travel&Young?Travel
Parent
	1 - 2/2 = 0/2
--> No gain 1. Stop

Root_State#France&German&Italy?German_Contract#Classic&Travel&Young?Young
Parent
	1 - 1/2 = 1/2
Sex ['F', 'M']
	F 1
		YES, 1/1
		1 - 1 = 0
	M 1
		NO, 1/1
		1 - 1 = 0
	(0 * 1/2) + (0 * 1/2) = 0/2
	Delta Gain: 1/2 - 0/2 = 1/2

--> Split By Sex#F&M 

Root_State#France&German&Italy?German_Contract#Classic&Travel&Young?Young_Sex#F&M?F
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?German_Contract#Classic&Travel&Young?Young_Sex#F&M?M
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_State#France&German&Italy?Italy
Parent
	1 - 4/5 = 1/5
Contract ['Classic', 'Young']
	Classic 3
		YES, 3/3
		1 - 3/3 = 0/3
	Young 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/2 * 2/5) + (0/3 * 3/5) = 1/5
	Delta Gain: 1/5 - 1/5 = 0/5

Sex ['F', 'M']
	F 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	M 3
		YES, 3/3
		1 - 3/3 = 0/3
	(0/3 * 3/5) + (1/2 * 2/5) = 1/5
	Delta Gain: 1/5 - 1/5 = 0/5

--> No gain 2. Stop

In [39]:
# Classify every row of the test set with the fitted tree.
prediction = tree.predict(test_df)
# Attach the predicted labels next to the ground-truth 'Churn' column.
# This mutates test_df in place, so the frame shown when it was loaded no
# longer reflects its current columns.
# NOTE(review): re-running this cell passes a frame that already contains
# 'Predicted' to tree.predict -- confirm that this is harmless.
test_df['Predicted'] = prediction
test_df
Out[39]:
State Contract Sex PreviousCompany Churn Predicted
0 Italy Classic F Tim YES YES
1 German Travel M Wind YES NO
2 Italy Travel M Wind YES YES
3 German Young F Tim YES YES
4 France Travel M Wind NO YES
5 France Classic F Tim NO NO
In [40]:
# Print the confusion matrix and the precision, recall, F1-measure and
# accuracy of the tree on the test set (see the recorded output below).
# NOTE(review): exercise 3b also asks for the same metrics on the training
# set; only the test set is evaluated in this notebook.
tree.evaluate(test_df)
R\P	|NO	|YES	|
NO	|1	|1	|
YES	|1	|3	|
Precision 3/4 0.75
Recall 3/4 0.75
F1-measure 3/4 0.75
Accuracy 2/3 0.666666666667
In [ ]: