In [31]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Project-local didactic data-mining toolkit used throughout this notebook
# for the clustering, pattern-mining and classification demos below.
import didactic_datamining as ddm
In [32]:
# Generate a toy 2-D dataset of 10 points with coordinates in [0, 10].
# NOTE(review): no random seed is passed, so the points printed below are
# not reproducible across runs — confirm whether create_dataset takes a seed.
dataset = ddm.create_dataset(npoints=10, minvalue=0, maxvalue=10)
ddm.print_dataset(dataset)
P0 [4 1]
P1 [0 8]
P2 [6 7]
P3 [9 5]
P4 [7 7]
P5 [3 8]
P6 [7 9]
P7 [3 5]
P8 [8 7]
P9 [0 6]
In [33]:
# K-Means with K=2 clusters, initial centroids taken from points P2 and P5
# (centroid_indexs), using Euclidean distance. With step_by_step=False the
# fit prints only the centroid pair after each iteration (see output below).
kmeans = ddm.DidatticKMeans(K=2, centroid_indexs=(2, 5), dist=ddm.euclidean_distance)
kmeans.fit(dataset, step_by_step=False)
C0 (6.00, 7.00)
C1 (3.00, 8.00)
C0 (1.50, 6.75)
C1 (6.83, 6.00)
C0 (1.50, 6.75)
C1 (6.83, 6.00)
Out[33]:
<didattic_kmeans.DidatticKMeans instance at 0x107f41a28>
In [34]:
# DBSCAN with neighborhood radius eps=1.8 and density threshold min_pts=3.
# With so small a radius on this dataset only one dense region forms: a single
# core point with two border points, everything else labelled noise (output).
dbscan = ddm.DidatticDbscan(eps=1.8, min_pts=3)
dbscan.fit(dataset, step_by_step=False)
Core Points [4]
Border Points [8, 2]
Noise Points [0, 1, 3, 5, 6, 7, 9]
defaultdict(<type 'set'>, {0: set([8, 2, 4])})
In [35]:
# Agglomerative hierarchical clustering with single linkage ('min' criteria).
# use_distances=True presumably makes fit print the full pairwise distance
# matrix at every merge iteration — matrices are shown in the output below.
hier = ddm.DidatticHierarchical()
hier.fit(dataset, link_criteria='min', use_distances=True, step_by_step=False)
iter 0
distance merge 0.00
[(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)]
[[ 0.    8.06  6.32  6.4   6.71  7.07  8.54  4.12  7.21  6.4 ]
 [ 8.06  0.    6.08  9.49  7.07  3.    7.07  4.24  8.06  2.  ]
 [ 6.32  6.08  0.    3.61  1.    3.16  2.24  3.61  2.    6.08]
 [ 6.4   9.49  3.61  0.    2.83  6.71  4.47  6.    2.24  9.06]
 [ 6.71  7.07  1.    2.83  0.    4.12  2.    4.47  1.    7.07]
 [ 7.07  3.    3.16  6.71  4.12  0.    4.12  3.    5.1   3.61]
 [ 8.54  7.07  2.24  4.47  2.    4.12  0.    5.66  2.24  7.62]
 [ 4.12  4.24  3.61  6.    4.47  3.    5.66  0.    5.39  3.16]
 [ 7.21  8.06  2.    2.24  1.    5.1   2.24  5.39  0.    8.06]
 [ 6.4   2.    6.08  9.06  7.07  3.61  7.62  3.16  8.06  0.  ]]

iter 1
distance merge 1.00
[(2, 4, 8), (0,), (1,), (3,), (5,), (6,), (7,), (9,)]
[[ 0.    8.06  6.32  6.4   6.4   7.07  8.54  4.12]
 [ 8.06  0.    6.08  9.49  2.    3.    7.07  4.24]
 [ 6.32  6.08  0.    2.24  6.08  3.16  2.    3.61]
 [ 6.4   9.49  2.24  0.    9.06  6.71  4.47  6.  ]
 [ 6.4   2.    6.08  9.06  0.    3.61  7.62  3.16]
 [ 7.07  3.    3.16  6.71  3.61  0.    4.12  3.  ]
 [ 8.54  7.07  2.    4.47  7.62  4.12  0.    5.66]
 [ 4.12  4.24  3.61  6.    3.16  3.    5.66  0.  ]]

iter 2
distance merge 2.00
[((1,), (7,)), ((0,), (5,)), ((2, 4, 8),), ((3,),), ((6,),), ((9,),)]
[[ 0.    2.24  6.08  3.16  6.32  3.61]
 [ 2.24  0.    9.06  6.71  6.4   6.  ]
 [ 6.08  9.06  0.    3.    6.4   3.16]
 [ 3.16  6.71  3.    0.    7.07  3.  ]
 [ 6.32  6.4   6.4   7.07  0.    4.12]
 [ 3.61  6.    3.16  3.    4.12  0.  ]]

iter 3
distance merge 2.24
[(((0,), (5,)), ((1,), (7,))), (((2, 4, 8),),), (((3,),),), (((6,),),), (((9,),),)]
[[ 0.    6.08  3.61  3.16  6.32]
 [ 6.08  0.    3.16  3.    6.4 ]
 [ 3.61  3.16  0.    3.    4.12]
 [ 3.16  3.    3.    0.    7.07]
 [ 6.32  6.4   4.12  7.07  0.  ]]

iter 4
distance merge 3.00
[((((2, 4, 8),),), (((3,),),), (((6,),),)), ((((0,), (5,)), ((1,), (7,))),), ((((9,),),),)]
[[ 0.    6.32  3.16]
 [ 6.32  0.    4.12]
 [ 3.16  4.12  0.  ]]

iter 5
distance merge 3.16
[(((((2, 4, 8),),), (((3,),),), (((6,),),)), ((((9,),),),)), (((((0,), (5,)), ((1,), (7,))),),)]
[[ 0.    4.12]
 [ 4.12  0.  ]]

iter 6
distance merge 4.12
[((((((0,), (5,)), ((1,), (7,))),),), (((((2, 4, 8),),), (((3,),),), (((6,),),)), ((((9,),),),)))]
[[ 0.]]
In [36]:
# Build a random transactional dataset: 10 transactions over 6 items
# ('A'..'F'), each transaction containing between 2 and 4 items.
# NOTE(review): no seed — the transactions are not reproducible across runs.
transactions = ddm.create_transactional_dataset(num_transaction=10, num_items=6, min_len=2, max_len=4)
ddm.print_transactions(transactions)
['C', 'F']
['B', 'C', 'D', 'F']
['B', 'C', 'D', 'E']
['A', 'B', 'C', 'F']
['A', 'B', 'E']
['C', 'D', 'E', 'F']
['D', 'F']
['A', 'D']
['C', 'D', 'E', 'F']
['D', 'E']
In [37]:
# Apriori frequent-itemset mining with minimum support 0.3.
# sup_type='r' presumably selects relative support — the supports printed
# below are fractions of the 10 transactions; pruned itemsets are marked 'X'.
apriori = ddm.DidatticApriori(min_sup=0.3, sup_type='r')
apriori.fit(transactions, step_by_step=False)
Apriori - Iteration 1
('A',) 0.30 
('B',) 0.40 
('C',) 0.60 
('D',) 0.70 
('E',) 0.50 
('F',) 0.60 
Apriori - Iteration 2
('A', 'B') 0.20 X
('A', 'C') 0.10 X
('A', 'D') 0.10 X
('A', 'E') 0.10 X
('A', 'F') 0.10 X
('B', 'C') 0.30 
('B', 'D') 0.20 X
('B', 'E') 0.20 X
('B', 'F') 0.20 X
('C', 'D') 0.40 
('C', 'E') 0.30 
('C', 'F') 0.50 
('D', 'E') 0.40 
('D', 'F') 0.40 
('E', 'F') 0.20 X
Apriori - Iteration 3
('C', 'D', 'E') 0.30 
('C', 'D', 'F') 0.30 
Apriori - Iteration 4
Out[37]:
<didattic_apriori.DidatticApriori instance at 0x1092d43b0>
In [38]:
# Derive association rules from the frequent itemsets found above, keeping
# only rules with confidence >= 0.8 (rules below threshold are marked 'X';
# accepted rules additionally report their lift).
apriori.extract_rules(min_conf=0.8)
('B',) --> ('C',) conf: 0.75 X 
('C',) --> ('B',) conf: 0.50 X 
('C',) --> ('D',) conf: 0.67 X 
('C',) --> ('D', 'E') conf: 0.50 X 
('C',) --> ('D', 'F') conf: 0.50 X 
('C',) --> ('E',) conf: 0.50 X 
('C',) --> ('E', 'D') conf: 0.50 X 
('C',) --> ('F',) conf: 0.83  lift: 1.39
('C',) --> ('F', 'D') conf: 0.50 X 
('C', 'D') --> ('E',) conf: 0.75 X 
('C', 'D') --> ('F',) conf: 0.75 X 
('C', 'E') --> ('D',) conf: 1.00  lift: 1.43
('C', 'F') --> ('D',) conf: 0.60 X 
('D',) --> ('C',) conf: 0.57 X 
('D',) --> ('C', 'E') conf: 0.43 X 
('D',) --> ('C', 'F') conf: 0.43 X 
('D',) --> ('E',) conf: 0.57 X 
('D',) --> ('E', 'C') conf: 0.43 X 
('D',) --> ('F',) conf: 0.57 X 
('D',) --> ('F', 'C') conf: 0.43 X 
('D', 'C') --> ('E',) conf: 0.00 X 
('D', 'C') --> ('F',) conf: 0.00 X 
('D', 'E') --> ('C',) conf: 0.75 X 
('D', 'F') --> ('C',) conf: 0.75 X 
('E',) --> ('C',) conf: 0.60 X 
('E',) --> ('C', 'D') conf: 0.60 X 
('E',) --> ('D',) conf: 0.80  lift: 1.14
('E',) --> ('D', 'C') conf: 0.60 X 
('E', 'C') --> ('D',) conf: 0.00 X 
('E', 'D') --> ('C',) conf: 0.00 X 
('F',) --> ('C',) conf: 0.83  lift: 1.39
('F',) --> ('C', 'D') conf: 0.50 X 
('F',) --> ('D',) conf: 0.67 X 
('F',) --> ('D', 'C') conf: 0.50 X 
('F', 'C') --> ('D',) conf: 0.00 X 
('F', 'D') --> ('C',) conf: 0.00 X 
In [39]:
import pandas as pd

# Load the training data; skipinitialspace strips blanks after each delimiter.
dataset_df = pd.read_csv('dataset.csv', skipinitialspace=True, delimiter=',')

# Columns used for the classification demo. The CSV also contains numeric
# columns 'ServiceCalls' and 'Minutes'; they are left out, presumably so
# every attribute the tree sees is categorical.
# (The original cell assigned the full 6-column list first and immediately
# overwrote it — that dead assignment has been removed.)
features = ['State', 'Contract', 'Sex', 'Churn']

dataset_df = dataset_df[features]
dataset_df
Out[39]:
State Contract Sex Churn
0 Italy Classic F YES
1 German Travel M NO
2 Italy Travel M YES
3 Italy Young F NO
4 German Travel F NO
5 German Classic M YES
6 German Classic M YES
7 German Young F YES
8 German Young M NO
9 German Travel F NO
In [40]:
# Decision-tree induction using Gini impurity as the split criterion.
# min_samples_split=2 / min_samples_leaf=1 let the tree split down to pure
# single-row leaves; the trace below shows the impurity computation and the
# gain of every candidate split at each node.
tree = ddm.DidatticClassificationTree(fun=ddm.gini, fun_name='gini', 
                                      min_samples_split=2, min_samples_leaf=1, step_by_step=False)
tree.fit(dataset_df, target='Churn')
Root
Parent
	1 - ((1/2)^2 + (1/2)^2) = 1/2
State ['German', 'Italy']
	German 7
		NO, 4/7
		YES, 3/7
		1 - ((4/7)^2 + (3/7)^2) = 24/49
	Italy 3
		NO, 1/3
		YES, 2/3
		1 - ((1/3)^2 + (2/3)^2) = 4/9
	(0/7 * 7/10) + (0/3 * 3/10) = 0/10
	Delta Gain: 5/10 - 0/10 = 0/10

Contract ['Classic', 'Travel', 'Young']
	Classic 3
		YES, 3/3
		1 - ((1/1)^2) = 0
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - ((3/4)^2 + (1/4)^2) = 3/8
	Young 3
		NO, 2/3
		YES, 1/3
		1 - ((2/3)^2 + (1/3)^2) = 4/9
	(0/4 * 4/10) + (0/3 * 3/10) + (0/3 * 3/10) = 0/10
Contract# ['Travel-Classic', 'Young']
	Travel-Classic 7
		NO, 3/7
		YES, 4/7
		1 - ((3/7)^2 + (4/7)^2) = 24/49
	Young 3
		NO, 2/3
		YES, 1/3
		1 - ((2/3)^2 + (1/3)^2) = 4/9
	(0/7 * 7/10) + (0/3 * 3/10) = 0/10
	Delta Gain: 5/10 - 0/10 = 0/10

Contract# ['Travel', 'Young-Classic']
	Travel 4
		NO, 3/4
		YES, 1/4
		1 - ((3/4)^2 + (1/4)^2) = 3/8
	Young-Classic 6
		NO, 2/6
		YES, 4/6
		1 - ((1/3)^2 + (2/3)^2) = 4/9
	(0/4 * 4/10) + (0/6 * 6/10) = 0/10
	Delta Gain: 5/10 - 0/10 = 0/10

Contract# ['Classic', 'Travel-Young']
	Classic 3
		YES, 3/3
		1 - ((1/1)^2) = 0
	Travel-Young 7
		NO, 5/7
		YES, 2/7
		1 - ((5/7)^2 + (2/7)^2) = 20/49
	(0/7 * 7/10) + (0/3 * 3/10) = 2/10
	Delta Gain: 5/10 - 2/10 = 0/10

Sex ['F', 'M']
	F 5
		NO, 3/5
		YES, 2/5
		1 - ((3/5)^2 + (2/5)^2) = 12/25
	M 5
		NO, 2/5
		YES, 3/5
		1 - ((2/5)^2 + (3/5)^2) = 12/25
	(0/5 * 5/10) + (0/5 * 5/10) = 0/10
	Delta Gain: 5/10 - 0/10 = 0/10

--> Split By Contract#Classic&Travel&Young 

Root_Contract#Classic&Travel&Young?Classic
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Travel
Parent
	1 - ((3/4)^2 + (1/4)^2) = 3/8
State ['German', 'Italy']
	German 3
		NO, 3/3
		1 - ((1/1)^2) = 0
	Italy 1
		YES, 1/1
		1 - ((1/1)^2) = 0
	(0/3 * 3/4) + (0 * 1/4) = 0/4
	Delta Gain: 0/4 - 0/4 = 0/4

Sex ['F', 'M']
	F 2
		NO, 2/2
		1 - ((1/1)^2) = 0
	M 2
		NO, 1/2
		YES, 1/2
		1 - ((1/2)^2 + (1/2)^2) = 1/2
	(1/2 * 2/4) + (0/2 * 2/4) = 1/4
	Delta Gain: 0/4 - 1/4 = 0/4

--> Split By State#German&Italy 

Root_Contract#Classic&Travel&Young?Travel_State#German&Italy?German
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Travel_State#German&Italy?Italy
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Young
Parent
	1 - ((2/3)^2 + (1/3)^2) = 4/9
State ['German', 'Italy']
	German 2
		NO, 1/2
		YES, 1/2
		1 - ((1/2)^2 + (1/2)^2) = 1/2
	Italy 1
		NO, 1/1
		1 - ((1/1)^2) = 0
	(1/2 * 2/3) + (0 * 1/3) = 1/3
	Delta Gain: 0/3 - 1/3 = 0/3

Sex ['F', 'M']
	F 2
		NO, 1/2
		YES, 1/2
		1 - ((1/2)^2 + (1/2)^2) = 1/2
	M 1
		NO, 1/1
		1 - ((1/1)^2) = 0
	(0 * 1/3) + (1/2 * 2/3) = 1/3
	Delta Gain: 0/3 - 1/3 = 0/3

--> Split By State#German&Italy 

Root_Contract#Classic&Travel&Young?Young_State#German&Italy?German
Parent
	1 - ((1/2)^2 + (1/2)^2) = 1/2
Sex ['F', 'M']
	F 1
		YES, 1/1
		1 - ((1/1)^2) = 0
	M 1
		NO, 1/1
		1 - ((1/1)^2) = 0
	(0 * 1/2) + (0 * 1/2) = 0/2
	Delta Gain: 1/2 - 0/2 = 1/2

--> Split By Sex#F&M 

Root_Contract#Classic&Travel&Young?Young_State#German&Italy?German_Sex#F&M?F
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Young_State#German&Italy?German_Sex#F&M?M
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

Root_Contract#Classic&Travel&Young?Young_State#German&Italy?Italy
Parent
	1 - ((1/1)^2) = 0
--> No gain 1. Stop

In [41]:
# Load the held-out test set and restrict it to the same feature columns
# the tree was trained on.
test_df = pd.read_csv('test.csv', skipinitialspace=True, delimiter=',')
test_df = test_df[features]
test_df
Out[41]:
State Contract Sex Churn
0 Italy Classic F YES
1 German Travel M NO
2 Italy Travel F NO
In [42]:
# Classify the test rows with the fitted tree and append the predictions as
# a new column next to the true 'Churn' labels for visual comparison.
prediction = tree.predict(test_df)
test_df['Predicted'] = prediction
test_df
Out[42]:
State Contract Sex Churn Predicted
0 Italy Classic F YES YES
1 German Travel M NO NO
2 Italy Travel F NO YES
In [43]:
# Print the confusion matrix (rows = real, columns = predicted) and the
# precision / recall / F1 / accuracy scores on the test set — judging by the
# printed values, 'YES' is treated as the positive class.
tree.evaluate(test_df)
R\P	|NO	|YES	|
NO	|1	|1	|
YES	|0	|1	|
Precision 1/2 0.5
Recall 1 1.0
F1-measure 2/3 0.666666666667
Accuracy 2/3 0.666666666667
In [ ]:
 
In [ ]: