In [1]:
%matplotlib inline

import didactic_datamining as ddm
In [ ]:
# Disabled alternative: build a dataset through the ddm helpers instead of the
# hard-coded lists used by the exercises below (presumably a generated 10-point
# set with values in [0, 10] — confirm against ddm.create_dataset).
#dataset = ddm.create_dataset(npoints=10, minvalue=0, maxvalue=10)
#ddm.print_dataset(dataset)

Exercise 1 - k-means (15 points)

a) Apply k-means to the dataset in the table and figure below using K=2 and the initial centroids c1=P2 and c2=P5. Explain what happens in each iteration (10 points).
b) Discuss the reason for the termination of k-means (3 points).
c) Identify another pair of initial centroids leading to the same clustering obtained in a) (2 points).

In [3]:
# Exercise 1 input: ten 2-D points given as [x, y] pairs.
# List positions 0..9 are what the clustering cell refers to by index.
dataset = [
    [5, 4], [3, 2], [6, 2], [2, 3], [8, 9],
    [4, 0], [3, 7], [7, 9], [4, 5], [3, 3],
]
In [5]:
# Exercise 1: k-means with K=2 and Euclidean distance, seeded from the points
# at list positions 2 and 5 of `dataset`.
# NOTE(review): the exercise text names the seeds P2 and P5 — confirm that the
# point labels in the figure use the same 0-based numbering as these indexes.
kmeans = ddm.DidatticKMeans(
    K=2,
    centroid_indexs=(2, 5),
    dist=ddm.euclidean_distance
)
# Last expression of the cell, so the value returned by fit() is displayed.
kmeans.fit(dataset, step_by_step=False)
C0 (6.00, 2.00)
C1 (4.00, 0.00)
C0 (3.00, 2.00)
C1 (5.50, 6.00)
C0 (5.40, 6.80)
C1 (3.60, 2.00)
C0 (3.83, 2.33)
C1 (5.50, 7.50)
C0 (6.00, 8.33)
C1 (3.86, 2.71)
C0 (6.00, 8.33)
C1 (3.86, 2.71)
Out[5]:
<didactic_kmeans.DidatticKMeans instance at 0x108cfac68>
In [ ]:
 

Exercise 2 - Dbscan (15 points)

a) Apply the DBSCAN algorithm to the dataset in the table and figure below with radius eps=1.8 and minPts=2 (1 neighbor plus the point itself), and for each point specify whether it is a core point, a border point or noise (10 points).
b) Indicate the composition of the clusters obtained (2 points).
c) Add the minimum number of points needed to transform the noise points into border points (3 points).

In [7]:
# Exercise 2 input: the same ten [x, y] points, redefined here so this
# exercise does not depend on the Exercise 1 cell having been run.
dataset = [
    [5, 4], [3, 2], [6, 2], [2, 3], [8, 9],
    [4, 0], [3, 7], [7, 9], [4, 5], [3, 3],
]
In [15]:
# Exercise 2: DBSCAN with the parameters given in the exercise statement
# (eps=1.8, minPts=2).
# All coordinates are integers, so squared Euclidean distances between points
# are integers too; no pair can lie at a distance strictly between sqrt(2) and
# 2. Assuming the helper uses Euclidean distance, any eps in that range
# (1.8 included) yields the same core/border/noise assignment.
dbscan = ddm.DidatticDbscan(eps=1.8, min_pts=2)
dbscan.fit(dataset, step_by_step=False)
Core Points [0, 1, 3, 4, 7, 8, 9]
Border Points []
Noise Points [2, 5, 6]
defaultdict(<type 'set'>, {0: set([0, 8]), 1: set([1, 3, 9]), 2: set([4, 7])})
In [ ]:
 

Exercise 3 - Decision Tree (25 points)

a) Use the training dataset below to build a decision tree for the variable “CHURN” based on the misclassification rate, expanding the nodes of the tree until no split provides a gain (18 points).
b) Provide the confusion matrix and evaluate the accuracy, precision, recall and F1-measure of the tree with respect to the test set AND the training set. You MUST provide the formulas for accuracy, precision, recall and F1-measure (7 points).

In [23]:
import pandas as pd

# Exercise 3 training set: comma-separated file read relative to the working
# directory; skipinitialspace drops blanks that follow each delimiter.
dataset_df = pd.read_csv(
    'dataset_compito_dm_20170405.csv',
    delimiter=',',
    skipinitialspace=True
)
# Display the full (12-row) training frame.
dataset_df
Out[23]:
State Contract Sex PreviousCompany Churn
0 Italy Classic F Tim YES
1 German Travel M Wind NO
2 Italy Travel M Wind YES
3 Italy Young F Tim NO
4 German Travel F Tim NO
5 Italy Classic M Tim YES
6 Italy Classic M Tim YES
7 German Young F Wind YES
8 German Young M Tim NO
9 German Travel F Tim NO
10 Italy Young M Tim YES
11 German Classic F Wind YES
In [25]:
# Exercise 3 test set: same columns and parsing options as the training file.
test_df = pd.read_csv(
    'testset_compito_dm_20170405.csv',
    delimiter=',',
    skipinitialspace=True
)
# Display the test frame.
test_df
Out[25]:
State Contract Sex PreviousCompany Churn
0 Italy Classic F Tim YES
1 German Travel M Wind NO
2 Italy Travel M Wind YES
3 German Young F Tim YES
In [16]:
# Exercise 3: grow a classification tree on the training frame, using the
# error (misclassification) rate as the split measure and 'Churn' as target.
tree = ddm.DidatticClassificationTree(
    fun=ddm.error_rate,
    fun_name='misc rate',
    min_samples_split=2,
    min_samples_leaf=1,
    step_by_step=False
)
# fit() prints the evaluation of every candidate split node by node.
tree.fit(dataset_df, target='Churn')
Root
Parent
	1 - 7/12 = 5/12
State ['German', 'Italy']
	German 6
		NO, 4/6
		YES, 2/6
		1 - 4/6 = 2/6
	Italy 6
		NO, 1/6
		YES, 5/6
		1 - 5/6 = 1/6
	(2/6 * 6/12) + (1/6 * 6/12) = 3/12
	Delta Gain: 5/12 - 3/12 = 2/12

Contract ['Classic', 'Travel', 'Young']
	Classic 4
		YES, 4/4
		1 - 4/4 = 0/4
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	Young 4
		NO, 2/4
		YES, 2/4
		1 - 2/4 = 2/4
	(1/4 * 4/12) + (2/4 * 4/12) + (0/4 * 4/12) = 3/12
Contract# ['Travel-Classic', 'Young']
	Travel-Classic 8
		NO, 3/8
		YES, 5/8
		1 - 5/8 = 3/8
	Young 4
		NO, 2/4
		YES, 2/4
		1 - 2/4 = 2/4
	(3/8 * 8/12) + (2/4 * 4/12) = 5/12
	Delta Gain: 5/12 - 5/12 = 0/12

Contract# ['Travel', 'Young-Classic']
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - 3/4 = 1/4
	Young-Classic 8
		NO, 2/8
		YES, 6/8
		1 - 6/8 = 2/8
	(1/4 * 4/12) + (2/8 * 8/12) = 3/12
	Delta Gain: 5/12 - 3/12 = 2/12

Contract# ['Classic', 'Travel-Young']
	Classic 4
		YES, 4/4
		1 - 4/4 = 0/4
	Travel-Young 8
		NO, 5/8
		YES, 3/8
		1 - 5/8 = 3/8
	(3/8 * 8/12) + (0/4 * 4/12) = 3/12
	Delta Gain: 5/12 - 3/12 = 2/12

Sex ['F', 'M']
	F 6
		NO, 3/6
		YES, 3/6
		1 - 3/6 = 3/6
	M 6
		NO, 2/6
		YES, 4/6
		1 - 4/6 = 2/6
	(2/6 * 6/12) + (3/6 * 6/12) = 5/12
	Delta Gain: 5/12 - 5/12 = 0/12

PreviousCompany ['Tim', 'Wind']
	Tim 8
		NO, 4/8
		YES, 4/8
		1 - 4/8 = 4/8
	Wind 4
		NO, 1/4
		YES, 3/4
		1 - 3/4 = 1/4
	(4/8 * 8/12) + (1/4 * 4/12) = 5/12
	Delta Gain: 5/12 - 5/12 = 0/12

--> Split By Contract#Classic&Travel&Young 

Root_Contract#Classic&Travel&Young?Classic
Parent
	1 - 4/4 = 0/4
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Travel
Parent
	1 - 3/4 = 1/4
State ['German', 'Italy']
	German 3
		NO, 3/3
		1 - 3/3 = 0/3
	Italy 1
		YES, 1/1
		1 - 1 = 0
	(0/3 * 3/4) + (0 * 1/4) = 0/4
	Delta Gain: 1/4 - 0/4 = 1/4

Sex ['F', 'M']
	F 2
		NO, 2/2
		1 - 2/2 = 0/2
	M 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/2 * 2/4) + (0/2 * 2/4) = 1/4
	Delta Gain: 1/4 - 1/4 = 0/4

PreviousCompany ['Tim', 'Wind']
	Tim 2
		NO, 2/2
		1 - 2/2 = 0/2
	Wind 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(0/2 * 2/4) + (1/2 * 2/4) = 1/4
	Delta Gain: 1/4 - 1/4 = 0/4

--> Split By State#German&Italy 

Root_Contract#Classic&Travel&Young?Travel_State#German&Italy?German
Parent
	1 - 3/3 = 0/3
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Travel_State#German&Italy?Italy
Parent
	1 - 1 = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Young
Parent
	1 - 2/4 = 2/4
State ['German', 'Italy']
	German 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	Italy 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/2 * 2/4) + (1/2 * 2/4) = 2/4
	Delta Gain: 2/4 - 2/4 = 0/4

Sex ['F', 'M']
	F 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	M 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/2 * 2/4) + (1/2 * 2/4) = 2/4
	Delta Gain: 2/4 - 2/4 = 0/4

PreviousCompany ['Tim', 'Wind']
	Tim 3
		NO, 2/3
		YES, 1/3
		1 - 2/3 = 1/3
	Wind 1
		YES, 1/1
		1 - 1 = 0
	(1/3 * 3/4) + (0 * 1/4) = 1/4
	Delta Gain: 2/4 - 1/4 = 1/4

--> Split By PreviousCompany#Tim&Wind 

Root_Contract#Classic&Travel&Young?Young_PreviousCompany#Tim&Wind?Tim
Parent
	1 - 2/3 = 1/3
State ['German', 'Italy']
	German 1
		NO, 1/1
		1 - 1 = 0
	Italy 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(0 * 1/3) + (1/2 * 2/3) = 1/3
	Delta Gain: 1/3 - 1/3 = 0/3

Sex ['F', 'M']
	F 1
		NO, 1/1
		1 - 1 = 0
	M 2
		NO, 1/2
		YES, 1/2
		1 - 1/2 = 1/2
	(1/2 * 2/3) + (0 * 1/3) = 1/3
	Delta Gain: 1/3 - 1/3 = 0/3

--> No gain 2. Stop

Root_Contract#Classic&Travel&Young?Young_PreviousCompany#Tim&Wind?Wind
Parent
	1 - 1 = 0
--> No gain 1. Stop

In [20]:
# Predict the target for every row of the test set with the fitted tree.
prediction = tree.predict(test_df)
# Store the predictions next to the true 'Churn' labels. This mutates test_df
# in place, so the frame displayed by the loading cell is stale from here on.
test_df['Predicted'] = prediction
# Display the test frame with the added 'Predicted' column.
test_df
Out[20]:
State Contract Sex PreviousCompany Churn Predicted
0 Italy Classic F Tim YES YES
1 German Travel M Wind NO NO
2 Italy Travel M Wind YES YES
3 German Young F Tim YES NO
In [21]:
# Print the confusion matrix (rows: real, columns: predicted) together with
# precision, recall, F1-measure and accuracy for the test set.
# NOTE(review): presumably relies on the 'Predicted' column being present in
# test_df — confirm against DidatticClassificationTree.evaluate.
tree.evaluate(test_df)
R\P	|NO	|YES	|
NO	|1	|0	|
YES	|1	|2	|
Precision 1 1.0
Recall 2/3 0.666666666667
F1-measure 4/5 0.8
Accuracy 3/4 0.75
In [ ]: