import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd


#Carga dataset
data = load_boston()
#Visualiza variables
print(data.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


#Formato de pandas
table_data = pd.DataFrame(data.data, columns=data.feature_names)
#Agregamos la clase a la tabla
table_data['Y'] = data.target


#Información del dataset
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.


#Correlaciones lineales
table_data.corr()


#X
X_RM = table_data[['RM']].to_numpy()
#Y
Y = data.target


#Visualización
plt.scatter(X_RM, Y)
plt.title('Dataset completo')
plt.show()


#Separación de los datos
X_train, X_test, Y_train, Y_test = train_test_split(X_RM,Y, test_size=0.3)
#Longitud de entrenamiento
print(len(X_train))
#Longitud de evaluación
print(len(X_test))

354
152


#Visualización datos de entrenamiento
plt.scatter(X_train,Y_train)
plt.title('Train dataset')
plt.show()


#Visualización datos de evaluación
plt.scatter(X_test,Y_test)
plt.title('Test dataset')
plt.show()


#Se concatena una columna de 1's a X
Xb = np.concatenate((X_train, np.ones((len(X_train),1))), axis=1)
print(Xb.shape)

(354, 2)


#XX^-1 
term1 = np.linalg.inv(np.dot(Xb.T,Xb))
#XY
term2 = np.dot(Xb.T,Y_train)

#Vector solución
w = np.dot(term1,term2)


#Visualización en datos de entrenamieno
plt.scatter(X_train, Y_train)
plt.plot(X_train, np.dot(Xb,w), color='r')
plt.title('Regression (train data)')
plt.show()


#Visualización en datos de evaluación
plt.scatter(X_test, Y_test)
plt.plot(X_test, np.dot(X_test,w[0])+w[1], color='r')
plt.title('Regression (test data)')
plt.show()


#Error de entrenamiento
print('Error train:', np.linalg.norm(X_train-np.dot(Xb,w))**2/len(X_train))
#Error de evaluación
print('Error test:', np.linalg.norm(X_test-(np.dot(X_test,w[0])+w[1]))**2/len(X_test))

Error train: 114925.07598424297
Error test: 303.7210347942579


#Media empírica de Y
mu = Y.sum(0)/len(Y)
sigma = 1 #((Y-mu)**2).sum(0)/len(Y)

print(mu,sigma, w)

22.532806324110677 1 [  9.75243839 -38.46270134]


#Distribución empírica de Y
a = (1./np.sqrt(sigma*2*np.pi))*np.exp(-(Y-mu)**2/sigma)


#f(X)
y_hat = (w[0]*X_RM[:,0] + w[1])
#Media empírica de f(X)
mu_hat = y_hat.sum(0)/len(y_hat)
sigma_hat = 1 #((y_hat-mu_hat)**2).sum(0)/len(y_hat)

#Distribución empírica de f(X)
e = (1./np.sqrt(sigma_hat*2*np.pi))*np.exp(-(mu_hat-y_hat)**2/sigma_hat)

#Visualización de distribuciones
plt.scatter(y_hat, e/e.sum(0))
plt.scatter(Y, a/a.sum(0))
plt.show()


#Cross entropy
H = -(a*np.log(e)).sum(0)

print('Cross entropy:', H)

Cross entropy: 608.3540288467498


#todos los datos
X_all = data.data


#Se agrega una columna de 1's
Xb = np.concatenate((X_all, np.ones((len(X_RM),1))), axis=1)

#Primer término
term1 = np.linalg.inv(np.dot(Xb.T,Xb))
#Segundo término
term2 = np.dot(Xb.T,Y)

#Obtención de parámetros
W = np.dot(term1,term2)


#Error sobre todo los datos
print('Error:', ((np.dot(Xb,W)-Y)**2).sum(0)/len(Y))

Error: 21.894831181729202


pd.DataFrame(W.reshape(1,len(W)), columns=list(data.feature_names)+['bias'])


from mpl_toolkits.mplot3d import Axes3D

#Visualización de dos variables
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(table_data[['LSTAT']].to_numpy(), table_data[['NOX']].to_numpy(), Y)
plt.show()


#Visualización de una variable
plt.scatter(table_data[['LSTAT']].to_numpy(), Y)
plt.show()


from sklearn.datasets import load_diabetes

data = load_diabetes()
print(data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	Y
CRIM	1.000000	-0.200469	0.406583	-0.055892	0.420972	-0.219247	0.352734	-0.379670	0.625505	0.582764	0.289946	-0.385064	0.455621	-0.388305
ZN	-0.200469	1.000000	-0.533828	-0.042697	-0.516604	0.311991	-0.569537	0.664408	-0.311948	-0.314563	-0.391679	0.175520	-0.412995	0.360445
INDUS	0.406583	-0.533828	1.000000	0.062938	0.763651	-0.391676	0.644779	-0.708027	0.595129	0.720760	0.383248	-0.356977	0.603800	-0.483725
CHAS	-0.055892	-0.042697	0.062938	1.000000	0.091203	0.091251	0.086518	-0.099176	-0.007368	-0.035587	-0.121515	0.048788	-0.053929	0.175260
NOX	0.420972	-0.516604	0.763651	0.091203	1.000000	-0.302188	0.731470	-0.769230	0.611441	0.668023	0.188933	-0.380051	0.590879	-0.427321
RM	-0.219247	0.311991	-0.391676	0.091251	-0.302188	1.000000	-0.240265	0.205246	-0.209847	-0.292048	-0.355501	0.128069	-0.613808	0.695360
AGE	0.352734	-0.569537	0.644779	0.086518	0.731470	-0.240265	1.000000	-0.747881	0.456022	0.506456	0.261515	-0.273534	0.602339	-0.376955
DIS	-0.379670	0.664408	-0.708027	-0.099176	-0.769230	0.205246	-0.747881	1.000000	-0.494588	-0.534432	-0.232471	0.291512	-0.496996	0.249929
RAD	0.625505	-0.311948	0.595129	-0.007368	0.611441	-0.209847	0.456022	-0.494588	1.000000	0.910228	0.464741	-0.444413	0.488676	-0.381626
TAX	0.582764	-0.314563	0.720760	-0.035587	0.668023	-0.292048	0.506456	-0.534432	0.910228	1.000000	0.460853	-0.441808	0.543993	-0.468536
PTRATIO	0.289946	-0.391679	0.383248	-0.121515	0.188933	-0.355501	0.261515	-0.232471	0.464741	0.460853	1.000000	-0.177383	0.374044	-0.507787
B	-0.385064	0.175520	-0.356977	0.048788	-0.380051	0.128069	-0.273534	0.291512	-0.444413	-0.441808	-0.177383	1.000000	-0.366087	0.333461
LSTAT	0.455621	-0.412995	0.603800	-0.053929	0.590879	-0.613808	0.602339	-0.496996	0.488676	0.543993	0.374044	-0.366087	1.000000	-0.737663
Y	-0.388305	0.360445	-0.483725	0.175260	-0.427321	0.695360	-0.376955	0.249929	-0.381626	-0.468536	-0.507787	0.333461	-0.737663	1.000000

Regresión lineal¶

Preparación de los datos¶

Estimación de la regresión¶

Distribución estadística¶

Regresión lineal con múltiples variables¶

Ejercicio¶