from __future__ import print_function, division
import geopandas as gp
from shapely.geometry import Point
import pandas as pd
import os
import json
import pylab as pl
%pylab inline
from pandas.plotting import scatter_matrix  # moved from pandas.tools.plotting in pandas 0.20+
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.decomposition import PCA
# !wget "http://www.nyc.gov/html/gbee/downloads/excel/nyc_benchmarking_disclosure_data_reported_in_2016.xlsx"
# read data
df = pd.read_excel("nyc_benchmarking_disclosure_data_reported_in_2016.xlsx")
df.to_csv(r'nyc_benchmarking_disclosure_data_reported_in_2016.csv', encoding = 'utf-8')
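#### Optional: a small guard (a convenience sketch, assuming the filenames above) so the slow download and Excel-to-CSV conversion only run once
xlsx_path = "nyc_benchmarking_disclosure_data_reported_in_2016.xlsx"
csv_path = xlsx_path.replace(".xlsx", ".csv")
if not os.path.exists(csv_path):
    # rebuild the CSV cache only when it is missing
    pd.read_excel(xlsx_path).to_csv(csv_path, encoding="utf-8")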
df.columns
df_csv = pd.read_csv('nyc_benchmarking_disclosure_data_reported_in_2016.csv' , index_col="Unnamed: 0")\
.iloc[:,[2,11,14,22,24,25,27,28,29,32,33,49,50,52]]
df_csv.head(5)
print ("the original dataset is {:d} observations by {:d} variables".\
format(*df_csv.shape))
df_csv.columns
# df_csv.set_index(keys=["Zip Code"], inplace=True)
df_csv['Zip Code'] = pd.to_numeric(df_csv['Zip Code'], errors='coerce')
# df_csv.index.dropna()
# df_csv.index.astype(int)
df_csv.head()
droped_zip = df_csv.dropna( subset = ['Zip Code'])
droped_zip.shape
def diff(df1, df2):
    """Report how many rows and columns were dropped going from df1 to df2."""
    row_diff = df1.shape[0] - df2.shape[0]
    col_diff = df1.shape[1] - df2.shape[1]
    row_pct = row_diff / df1.shape[0] * 100
    col_pct = col_diff / df1.shape[1] * 100
    if row_diff == 0 and col_diff == 0:  # 'and', not '&': '&' binds before '=='
        print("No difference")
    else:
        print("the original data set has", df1.shape[0], "rows and", df1.shape[1], "columns;",
              row_diff, "rows and", col_diff, "columns were dropped,",
              "i.e. {:.2f}% of rows and {:.2f}% of columns".format(row_pct, col_pct))
diff(df_csv, droped_zip)
droped_zip.describe()
#### We have two reported areas: the self-reported one and the DOF one. We need to determine which is reasonable to use. The DOF and self-reported areas have very similar means and medians; however, the self-reported minimum is 0, which is obviously wrong. I will look into the difference between the two areas.
Area =pd.DataFrame(droped_zip.iloc[:,7]-droped_zip.iloc[:,8], columns = ['Diff_DOFandSelf'])
Area.head()
print ("the Area different of 99% interval is [{:.2f} , {:.2f}] in ft\2 \
.Also, we knew self-report has 0 ft^2 in minimum so I will chose DOF area.".format(Area.Diff_DOFandSelf.quantile(0.005),
Area.Diff_DOFandSelf.quantile(0.995)))
### Visualize the area difference histogram
ax = (Area.Diff_DOFandSelf).plot(kind="hist", bins = 10000)
ax.set_xlabel("Area difference")
ax.set_xlim([Area.Diff_DOFandSelf.quantile(0.005), Area.Diff_DOFandSelf.quantile(0.995)])
droped_zip.head()
EUI =pd.DataFrame(droped_zip.iloc[:,7]-droped_zip.iloc[:,9], columns = ['Diff_EUI'])
EUI.head()
print ("the EUI different of 95% interval is [{:.2f} , {:.2f}] in kBtu \
.".format(EUI.Diff_EUI.quantile(0.025),
EUI.Diff_EUI.quantile(0.99)))
### Visualize the EUI difference histogram
ax = (EUI.Diff_EUI).plot(kind="hist", bins = 10000)
ax.set_xlabel("EUI difference(kBtu/ft^2)")
ax.set_xlim([EUI.Diff_EUI.quantile(0.025), EUI.Diff_EUI.quantile(0.99)])
##### The Source EUI is always bigger than the Site EUI; therefore, I will choose the Source EUI
#### Multiply the area by the Source EUI (kBtu/ft²), and likewise the weather-normalized EUI by the area
droped_zip['Source_EUI(kBtu)'] = droped_zip.iloc[:,9] * droped_zip.iloc[:,11]
droped_zip['Normalized_Source_EUI(kBtu)'] = droped_zip.iloc[:,10] * droped_zip.iloc[:,11]
droped_zip.head()
#### Drop NaN values of Source_EUI and Normalized_Source_EUI
droped_Source = droped_zip.dropna(
subset = ['Source_EUI(kBtu)']
)
diff(droped_zip,droped_Source)
wheater_cleaned_Source = droped_Source.dropna(
subset = ['Normalized_Source_EUI(kBtu)']
)
diff(droped_Source, wheater_cleaned_Source)
wheater_cleaned_Source.head()
#### The metered area also has to be Whole Building, so I need to clean up the data
Clean_Metered = wheater_cleaned_Source[wheater_cleaned_Source['Metered Areas (Energy)'] == 'Whole Building']
diff(wheater_cleaned_Source, Clean_Metered)
#### Let's check whether Source_EUI or Normalized_Source_EUI has weird values by using the describe function
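#### (The describe call itself seems to be missing here; a minimal version over the two columns created above:)
Clean_Metered[['Source_EUI(kBtu)', 'Normalized_Source_EUI(kBtu)']].describe()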
#### Let's look up what kind of Property Usage is most frequent.
Clean_Metered['Primary Property Type - Self Selected'].value_counts()
#### Let's split out Multifamily Housing and Office, because these two property types make up most of the data set
Multifamily = Clean_Metered[Clean_Metered['Primary Property Type - Self Selected'] == 'Multifamily Housing']
Office = Clean_Metered[Clean_Metered['Primary Property Type - Self Selected'] == 'Office']
# print ("the original dataset is {:d} observations by {:d} variables".\
# format(*df_csv.shape))
print('Total Multifamily is {:d} and Total Office is {:d}.'.format(Multifamily.shape[0], Office.shape[0]))
Multifamily.describe()
#### The Source_EUI and Normalized_Source_EUI means and STDs are very similar, so we can look at either one. However, we should still check the distribution shape first. Also, the STD is much larger than the mean and the 25%/75% quantiles, so the histogram will look weird.
ax = Multifamily['Source_EUI(kBtu)'].plot(kind="hist", bins = 10, label = 'Not Normalized', legend = True)
ax = Multifamily['Normalized_Source_EUI(kBtu)'].plot(kind="hist", bins = 10, label = 'Normalized', alpha = 0.5, legend = True)
ax.set_xlabel("Source EUI (kBtu)")
#### Because the STD is so huge, the histogram doesn't make sense. Let's take log10. However, np.log10(0) gives -inf, so we need to clean again.
Multifamily = Multifamily[Multifamily['Source_EUI(kBtu)'] > 0]
Multifamily = Multifamily[Multifamily['Normalized_Source_EUI(kBtu)'] > 0]
Multifamily['log_Source_EUI'] = np.log10(Multifamily.iloc[:,14])
Multifamily['Normalized_log_Source_EUI'] = np.log10(Multifamily.iloc[:,15])
Multifamily.head()
Multifamily.columns
# KS test against a normal distribution with mean/std estimated from the sample
source = stats.kstest(Multifamily['log_Source_EUI'], 'norm', args=(Multifamily['log_Source_EUI'].mean(), Multifamily['log_Source_EUI'].std()))
Normalsource = stats.kstest(Multifamily['Normalized_log_Source_EUI'], 'norm', args=(Multifamily['Normalized_log_Source_EUI'].mean(), Multifamily['Normalized_log_Source_EUI'].std()))
print("log_Source_EUI and Normalized_log_Source_EUI have KS p-values of {:f} and {:f} against a normal distribution (estimating the parameters from the same sample inflates these p-values), so I will trim below the 5th and above the 95th percentiles rather than using 2 * STD.".format(source[1], Normalsource[1]))
nlMultifamily = Multifamily[(Multifamily['log_Source_EUI'] > (Multifamily['log_Source_EUI'].quantile(0.05))) \
& (Multifamily['log_Source_EUI'] < Multifamily['log_Source_EUI'].quantile(0.95))]
diff(Multifamily,nlMultifamily)
nlMultifamily['log_Source_EUI'].describe()
ax = nlMultifamily['log_Source_EUI'].plot(kind="hist", bins = 100, legend = True)
ax.set_xlabel("log_Source EUI")
ax.set_xlim([6.65,7.81])
NormalMultifamily = Multifamily[(Multifamily['Normalized_log_Source_EUI'] > (Multifamily['Normalized_log_Source_EUI'].quantile(0.05))) \
& (Multifamily['Normalized_log_Source_EUI'] < Multifamily['Normalized_log_Source_EUI'].quantile(0.95))]
NormalMultifamily['Normalized_log_Source_EUI'].describe()
diff(Multifamily,NormalMultifamily)
ax = NormalMultifamily['Normalized_log_Source_EUI'].plot(kind="hist", bins = 200, legend = True)
ax.set_xlabel("Wheater Normalized log Source EUI")
ax.set_xlim([6.688,7.8])
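#### The 5%-95% trim above is repeated for several columns; a small helper (hypothetical, not part of the original notebook) would keep it consistent:
def trim_quantiles(df, col, lo=0.05, hi=0.95):
    """Keep rows where df[col] lies strictly between its lo and hi quantiles."""
    return df[(df[col] > df[col].quantile(lo)) & (df[col] < df[col].quantile(hi))]

# e.g. trim_quantiles(Multifamily, 'log_Source_EUI') reproduces nlMultifamily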
#### Let's find the correlation between the two EUIs and the Energy Star score
nlMultifamily.columns
nlMultifamily.rename(columns={'ENERGY STAR Score': 'Escore'}, inplace=True)
nlMultifamily.head()
NormalMultifamily.rename(columns={'ENERGY STAR Score': 'Escore'}, inplace=True)
NormalMultifamily.head()
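#### Before fitting OLS, a quick correlation check (an added step; Escore is the column renamed above):
nlMultifamily[['Escore', 'log_Source_EUI']].corr()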
EngVSlogEUI=smf.ols('Escore ~ log_Source_EUI',nlMultifamily).fit()
EngVSlogEUI.summary()
nlMultifamily.shape
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nlMultifamily['log_Source_EUI'], nlMultifamily['Escore'], 'o', label="data")
df1_ = nlMultifamily.sort_values(by='log_Source_EUI')
ax.plot(df1_.log_Source_EUI, EngVSlogEUI.predict(df1_), color="DarkOrange", label='OLS')
ax.set_xlim(6.6 , 7.82)
ax.set_ylim(0 , 102)
ax.set_xlabel("Log_Source_EUI")
ax.set_ylabel("Energy Star Score")
ax.legend(loc='best');
EngVSlogNormal = smf.ols('Escore ~ Normalized_log_Source_EUI', data = NormalMultifamily).fit()
EngVSlogNormal.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(NormalMultifamily['Normalized_log_Source_EUI'], NormalMultifamily['Escore'], 'o', label="data")
df3_ = NormalMultifamily.sort_values(by='Normalized_log_Source_EUI')
ax.plot(df3_.Normalized_log_Source_EUI, EngVSlogNormal.predict(df3_), color="Red", label='OLS')
ax.set_xlabel("Normalized_log_Source_EUI")
ax.set_ylabel("Energy Star Score")
ax.set_xlim(6.6 , 7.82)
ax.set_ylim(0 , 102)
ax.legend(loc='best');
Office.describe()
ax = Office['Source_EUI(kBtu)'].plot(kind="hist", bins = 3000, legend = True)
ax.set_xlabel("Source EUI(kBtu)")
ax.set_xlim([0, 0.5e+9])
ax = Office['Normalized_Source_EUI(kBtu)'].plot(kind="hist", bins = 3000, legend = True)
ax.set_xlabel("Normalized_Source_EUI(kBtu)")
ax.set_xlim([0, 0.5e+9])
#### It doesn't look like a normal distribution at all
Office = Office[Office['Source_EUI(kBtu)'] > 0]
Office = Office[Office['Normalized_Source_EUI(kBtu)'] > 0]
Office['log_Source_EUI'] = np.log10(Office.iloc[:,14])
Office['Normalized_log_Source_EUI'] = np.log10(Office.iloc[:,15])
Office.head()
Office.describe()
# the KS result gets its own name so it does not shadow the nlNormaloffice DataFrame created below
nlofficeSource = stats.kstest(Office['log_Source_EUI'], 'norm', args=(Office['log_Source_EUI'].mean(), Office['log_Source_EUI'].std()))
nlNormalofficeKS = stats.kstest(Office['Normalized_log_Source_EUI'], 'norm', args=(Office['Normalized_log_Source_EUI'].mean(), Office['Normalized_log_Source_EUI'].std()))
print("the log Source EUI p-value is {:.2f} and the Normalized log Source EUI p-value is {:.2f}, so we cannot treat either as a normal distribution".format(nlofficeSource[1], nlNormalofficeKS[1]))
nloffice = Office[(Office['log_Source_EUI'] > (Office['log_Source_EUI'].quantile(0.05))) \
& (Office['log_Source_EUI'] < Office['log_Source_EUI'].quantile(0.95))]
nlNormaloffice = Office[(Office['Normalized_log_Source_EUI'] > (Office['Normalized_log_Source_EUI'].quantile(0.05))) \
& (Office['Normalized_log_Source_EUI'] < Office['Normalized_log_Source_EUI'].quantile(0.95))]
diff(Office,nloffice)
diff(Office,nlNormaloffice)
nloffice['log_Source_EUI'].describe()
nlNormaloffice['Normalized_log_Source_EUI'].describe()
ax = nloffice['log_Source_EUI'].plot(kind="hist", bins = 100, legend = True)
ax.set_xlabel("log_Source_EUI for Office")
ax.set_xlim([6.78, 8.53])
ax = nlNormaloffice['Normalized_log_Source_EUI'].plot(kind="hist", bins = 100, legend = True)
ax.set_xlabel("log_Normalized_Source_EUI for Office")
ax.set_xlim([6.78, 8.53])
nloffice.head()
nloffice.rename(columns={'ENERGY STAR Score': 'Escore'}, inplace=True)
nloffice.head()
nlNormaloffice.rename(columns={'ENERGY STAR Score': 'Escore'}, inplace=True)
nlNormaloffice.head()
office_EsVSlogEUI=smf.ols('Escore ~ log_Source_EUI', nloffice).fit()
office_EsVSlogEUI.summary()
office_EsVSNlogEUI=smf.ols('Escore ~ Normalized_log_Source_EUI',nlNormaloffice).fit()
office_EsVSNlogEUI.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nloffice['log_Source_EUI'], nloffice['Escore'], 'o', label="data")
df1_ = nloffice.sort_values(by='log_Source_EUI')
ax.plot(df1_.log_Source_EUI, office_EsVSlogEUI.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
ax.set_ylim(0 , 101)
ax.set_xlabel("Log_Source_EUI")
ax.set_ylabel("Energy Star Score")
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nlNormaloffice['Normalized_log_Source_EUI'], nlNormaloffice['Escore'], 'o', label="data")
df1_ = nlNormaloffice.sort_values(by='Normalized_log_Source_EUI')
ax.plot(df1_.Normalized_log_Source_EUI, office_EsVSNlogEUI.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
ax.set_ylim(0 , 101)
ax.set_xlabel("Log_Normalized_Source_EUI")
ax.set_ylabel("Energy Star Score")
#### Let's look at Occupancy versus log_Source_EUI and Normalized_log_Source_EUI
There are no NaN values in Occupancy.
nlMultifamily.Occupancy.describe()
ax = nlMultifamily['Occupancy'].plot(kind="hist", bins =50, legend = True)
ax.set_xlabel("Multifamily Occupancy")
ax.set_xlim([60, 100])
#### I chose an Occupancy range of 80 to 100 because of the histogram shape
OcMultifamily = nlMultifamily[nlMultifamily['Occupancy'] > 79]
diff(nlMultifamily,OcMultifamily)
MFeuiVSocc=smf.ols('log_Source_EUI ~ Occupancy', OcMultifamily).fit()
MFeuiVSocc.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(OcMultifamily['Occupancy'], OcMultifamily['log_Source_EUI'], 'o', label="data")
df1_ = OcMultifamily.sort_values(by='Occupancy')
ax.plot(df1_.Occupancy, MFeuiVSocc.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("Multifamily Occupancy")
ax.set_ylabel("Log_Source_EUI")
MFNeuiVSocc=smf.ols('Normalized_log_Source_EUI ~ Occupancy', OcMultifamily).fit()
MFNeuiVSocc.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(OcMultifamily['Occupancy'], OcMultifamily['Normalized_log_Source_EUI'], 'o', label="data")
df1_ = OcMultifamily.sort_values(by='Occupancy')
ax.plot(df1_.Occupancy, MFNeuiVSocc.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("Multifamily Occupancy")
ax.set_ylabel("Log_Normalized_Source_EUI")
#### For office
nloffice.head()
nlNormaloffice.head()
OeuiVSocc=smf.ols('log_Source_EUI ~ Occupancy', nloffice).fit()  # the plot below uses log_Source_EUI, so the formula should too
OeuiVSocc.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nloffice['Occupancy'], nloffice['log_Source_EUI'], 'o', label="data")
df1_ = nloffice.sort_values(by='Occupancy')
ax.plot(df1_.Occupancy, OeuiVSocc.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("Office Occupancy")
ax.set_ylabel("Log_Source_EUI")
ONeuiVSocc=smf.ols('Normalized_log_Source_EUI ~ Occupancy', nlNormaloffice).fit()
ONeuiVSocc.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nlNormaloffice['Occupancy'], nlNormaloffice['Normalized_log_Source_EUI'], 'o', label="data")
df1_ = nlNormaloffice.sort_values(by='Occupancy')
ax.plot(df1_.Occupancy, ONeuiVSocc.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("Office Occupancy")
ax.set_ylabel("Log Normalized Source EUI")
#### Let's derive a building-age variable (Year) from Year Built for modeling against log_Source_EUI
OcMultifamily['Year'] = (2017 - pd.to_numeric(OcMultifamily['Year Built'], downcast = 'integer'))
OcMultifamily = OcMultifamily[OcMultifamily['Year'] < OcMultifamily['Year'].quantile(0.95)]
nlNormaloffice['Year'] = (2017 - pd.to_numeric(nlNormaloffice['Year Built'], downcast = 'integer'))
nlNormaloffice = nlNormaloffice[nlNormaloffice['Year'] < nlNormaloffice['Year'].quantile(0.95)]
OcMultifamily.head()
MFeuiVSyear=smf.ols('log_Source_EUI ~ Year', OcMultifamily).fit()
MFeuiVSyear.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(OcMultifamily['Year'], OcMultifamily['log_Source_EUI'], 'o', label="data")
df1_ = OcMultifamily.sort_values(by='Year')
ax.plot(df1_.Year, MFeuiVSyear.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("House age in year")
ax.set_ylabel("Multifamily Log Source EUI")
### I think the distribution of building ages affects the OLS fit; let's look at the histogram of ages
ax = OcMultifamily['Year'].plot(kind="hist", bins =50, legend = True)
ax.set_xlabel("Year")
ax.set_xlim([0, 110])
#### Buildings 80 to 100 years old are the most frequent, so they exert higher leverage on the OLS model
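#### One hedged alternative (not used in the original) is a robust linear model, which down-weights observations with large residuals:
# a sketch with statsmodels' RLM; the formula mirrors the OLS above
MF_rlm = smf.rlm('log_Source_EUI ~ Year', OcMultifamily).fit()
print(MF_rlm.params)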
OffeuiVSyear=smf.ols('log_Source_EUI ~ Year', nlNormaloffice).fit()
OffeuiVSyear.summary()
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(nlNormaloffice['Year'], nlNormaloffice['log_Source_EUI'], 'o', label="data")
df1_ = nlNormaloffice.sort_values(by='Year')
ax.plot(df1_.Year, OffeuiVSyear.predict(df1_), color="Orange", label='OLS')
ax.legend(loc='best');
# ax.set_xlim(3 , 4.5)
# ax.set_ylim(0 , 101)
ax.set_xlabel("Office age in year")
ax.set_ylabel("Office Log Source EUI")
ax = nlNormaloffice['Year'].plot(kind="hist", bins =50, legend = True)
ax.set_xlabel("Office Year")
ax.set_xlim([0, 120])
BK = pd.read_csv('BORO/BK.csv')
BX = pd.read_csv('BORO/BX.csv')
MN = pd.read_csv('BORO/MN.csv')
QN = pd.read_csv('BORO/QN.csv')
SI = pd.read_csv('BORO/SI.csv')
OcMultifamily['BBL'] = pd.to_numeric(OcMultifamily['NYC Borough, Block and Lot (BBL)'], downcast='integer')
nlNormaloffice['BBL'] = pd.to_numeric(nlNormaloffice['NYC Borough, Block and Lot (BBL)'], downcast='integer')  # was read from nloffice, a different subset
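#### For reference, a numeric BBL packs three fields: one borough digit, five block digits, four lot digits; a small illustrative split (the example value is hypothetical):
bbl = 1000477501                 # hypothetical BBL
borough = bbl // 10**9           # leading borough digit (1-5)
block = (bbl // 10**4) % 10**5   # 5-digit tax block
lot = bbl % 10**4                # 4-digit lot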
PLUTO = pd.concat([BK, BX, MN, QN, SI], ignore_index=True)
PLUTO.shape
PLUTO = pd.DataFrame({'NumFloors' : PLUTO['NumFloors'], 'BBL' : PLUTO['BBL']})
mMultifamily = OcMultifamily.merge(PLUTO, on='BBL', sort=False, suffixes=('_BM', '_PLUTO'), copy=True, indicator=False)
diff(OcMultifamily, mMultifamily)
moffice = nlNormaloffice.merge(PLUTO, on='BBL', sort=False, suffixes=('_BM', '_PLUTO'), copy=True, indicator=False)
diff(nlNormaloffice ,moffice)
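#### A quick sanity check on the join (an added check, not in the original): how many rows fail to find a PLUTO match?
print((~OcMultifamily['BBL'].isin(PLUTO['BBL'])).sum(), "multifamily rows have no PLUTO BBL match")
print((~nlNormaloffice['BBL'].isin(PLUTO['BBL'])).sum(), "office rows have no PLUTO BBL match")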
#### Let's do PCA with Occupancy, Year, floor area, and number of floors; the value to predict is Source EUI or the Energy Star score
#### All I need from PLUTO is the number of floors, because tall buildings would spend more energy on operation
mMultifamily.head()
mMultifamily.columns
# drop NaNs jointly so the feature matrix and the targets stay row-aligned for the scatter coloring below
dataXY = mMultifamily.iloc[:,[4,11,18,20,15,14]].dropna()
dataX = dataXY.iloc[:,:4]
dataY1 = dataXY.iloc[:,4] # Normalized Source EUI
dataY2 = dataXY.iloc[:,5] # Source_EUI
import matplotlib.pyplot as plt
X1 = dataX
Y1 = dataY1
pd.DataFrame(X1).corr()
n=4 # how many eigenvectors we choose
pca = PCA(n)
Xproj = pca.fit_transform(X1)
eigenvalues = pca.explained_variance_
print (pca.explained_variance_ratio_)
plt.bar(np.arange(n), eigenvalues);
plt.xlabel("Dimensionality")
plt.ylabel("Variance")
plt.show()
plt.figure(2, figsize=(8, 6))
plt.clf()
# Plot the training points
plt.scatter(Xproj[:, 0], Xproj[:, 1], c=Y1, cmap=plt.cm.cool)
plt.xlabel('First eigenvector')
plt.ylabel('Second eigenvector')
plt.show()
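#### Caveat (an added note): the four features have very different scales, so unscaled PCA is dominated by the largest-variance column; a minimal sketch with standardization, assuming sklearn's StandardScaler:
from sklearn.preprocessing import StandardScaler
# zero mean / unit variance per feature so floor area does not dominate the components
Xs = StandardScaler().fit_transform(X1)
print(PCA(n).fit(Xs).explained_variance_ratio_)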
X2 = dataX
Y2 = dataY2
pd.DataFrame(X2).corr()
n=4 # how many eigenvectors we choose
pca = PCA(n)
Xproj = pca.fit_transform(X2)
eigenvalues = pca.explained_variance_
print (pca.explained_variance_ratio_)
plt.bar(np.arange(n), eigenvalues);
plt.xlabel("Dimensionality")
plt.ylabel("Variance")
plt.show()
plt.figure(2, figsize=(8, 6))
plt.clf()
# Plot the training points
plt.scatter(Xproj[:, 0], Xproj[:, 1], c=Y2, cmap=plt.cm.cool)
plt.xlabel('First eigenvector')
plt.ylabel('Second eigenvector')
plt.show()
moffice.head()
# same joint dropna as for the multifamily set, to keep rows aligned
dataXY = moffice.iloc[:,[4,11,18,20,15,14]].dropna()
dataX = dataXY.iloc[:,:4]
dataY1 = dataXY.iloc[:,4] # Normalized Source EUI
dataY2 = dataXY.iloc[:,5] # Source_EUI
X1 = dataX
Y1 = dataY1
pd.DataFrame(X1).corr()
n=4 # how many eigenvectors we choose
pca = PCA(n)
Xproj = pca.fit_transform(X1)
eigenvalues = pca.explained_variance_
print (pca.explained_variance_ratio_)
plt.bar(np.arange(n), eigenvalues);
plt.xlabel("Dimensionality")
plt.ylabel("Variance")
plt.show()
plt.figure(2, figsize=(8, 6))
plt.clf()
# Plot the training points
plt.scatter(Xproj[:, 0], Xproj[:, 1], c=Y1, cmap=plt.cm.cool)
plt.xlabel('First eigenvector')
plt.ylabel('Second eigenvector')
plt.show()
X2 = dataX
Y2 = dataY2
pd.DataFrame(X2).corr()
n=4 # how many eigenvectors we choose
pca = PCA(n)
Xproj = pca.fit_transform(X2)
eigenvalues = pca.explained_variance_
print (pca.explained_variance_ratio_)
plt.bar(np.arange(n), eigenvalues);
plt.xlabel("Dimensionality")
plt.ylabel("Variance")
plt.show()
plt.figure(2, figsize=(8, 6))
plt.clf()
# Plot the training points
plt.scatter(Xproj[:, 0], Xproj[:, 1], c=Y2, cmap=plt.cm.cool)
plt.xlabel('First eigenvector')
plt.ylabel('Second eigenvector')
plt.show()
#### There are outliers; let's try to clean them in further research
#### What about multivariate modeling for Source_EUI and Normalized_Source_EUI?
mMultifamily = mMultifamily.rename(columns={'DOF Property Floor Area (ft²)': 'DOF_AREA'})
mMultifamily = mMultifamily.rename(columns={'Source_EUI(kBtu)': 'Source_EUI'})
mMultifamily.columns
seuiVSMV = smf.ols('Source_EUI ~ Occupancy + DOF_AREA + Year + NumFloors', mMultifamily).fit()
seuiVSMV.summary()
#### The R-squared value is 0.744; however, the Occupancy p-value is 0.879, so let's drop Occupancy
seuiVSMV2 = smf.ols('Source_EUI ~ DOF_AREA + Year + NumFloors', mMultifamily).fit()
seuiVSMV2.summary()
#### The same R-squared value, 0.744, so Occupancy contributes nothing to this model
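#### As a supplementary check (not in the original), an F-test between the nested models, assuming both were fit on the same rows:
from statsmodels.stats.anova import anova_lm
# compare the restricted model (no Occupancy) against the full model
print(anova_lm(seuiVSMV2, seuiVSMV))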
moffice = moffice.rename(columns={'DOF Property Floor Area (ft²)': 'DOF_AREA'})
moffice = moffice.rename(columns={'Source_EUI(kBtu)': 'Source_EUI'})
seuiVSMV3 = smf.ols('Source_EUI ~ Occupancy + DOF_AREA + Year + NumFloors', moffice).fit()
seuiVSMV3.summary()
#### The R-squared value is 0.833, but the office Occupancy p-value is bigger than 0.05, so let's drop it
seuiVSMV4 = smf.ols('Source_EUI ~ DOF_AREA + Year + NumFloors', moffice).fit()
seuiVSMV4.summary()
#### The R-squared value is 0.832, and all variable p-values are below 0.05
mMultifamily['Predict_Source_EUI'] = seuiVSMV2.predict()
moffice['Predict_Source_EUI'] = seuiVSMV4.predict()
#### Let's bring in the PLUTO shapefiles and merge on BBL to map Source_EUI
mMultifamily.head()
MNS = gp.read_file('nybb17/MNMapPLUTO.shp')
BXS = gp.read_file('nybb17/BXMapPLUTO.shp')
BKS = gp.read_file('nybb17/BKMapPLUTO.shp')
SIS = gp.read_file('nybb17/SIMapPLUTO.shp')
QNS = gp.read_file('nybb17/QNMapPLUTO.shp')
PLUTOS = gp.GeoDataFrame(pd.concat([MNS, BXS, BKS, SIS, QNS], ignore_index=True))
f, ax = plt.subplots(figsize=(20,20))
PLUTOS.plot(linewidth=0.5, edgecolor='black',ax = ax)
mMultifamily['NYC Borough, Block and Lot (BBL)'] = pd.to_numeric(mMultifamily['NYC Borough, Block and Lot (BBL)'] , downcast= 'integer')
moffice['NYC Borough, Block and Lot (BBL)'] = pd.to_numeric(moffice['NYC Borough, Block and Lot (BBL)'] , downcast= 'integer')
mMultifamily.head()
PLUTO1 = PLUTOS.merge(mMultifamily, on = 'BBL', how="inner")
PLUTO1 = gp.GeoDataFrame(PLUTO1)
PLUTO2 = PLUTOS.merge(moffice, on = 'BBL', how="inner")
f, ax = plt.subplots(figsize=(20,20))
vmin, vmax = min(PLUTO1['Source_EUI']) , max(PLUTO1['Source_EUI'])
f.suptitle('NYC Source EUI (kBtu)')
base = PLUTOS.plot(color='white', edgecolor='black', linewidth = 0.1 ,ax =ax)
PLUTO1.plot(column='Source_EUI', cmap='hot',vmin = vmin, vmax = vmax, linewidth=0.5, edgecolor='black',ax = ax)
# add a colorbar for the choropleth
fig = ax.get_figure()
cax = fig.add_axes([0.9, 0.1, 0.03, 0.8])
sm = plt.cm.ScalarMappable(cmap='hot', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# fake up the array of the scalar mappable...
sm._A = []
fig.colorbar(sm, cax=cax)
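#### Note (an added aside): newer geopandas releases can draw this colorbar directly via legend=True; left commented so the figure above is not redrawn:
# PLUTO1.plot(column='Source_EUI', cmap='hot', legend=True, ax=ax)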
f, ax = plt.subplots(figsize=(20,20))
vmin, vmax = min(PLUTO1['Predict_Source_EUI']) , max(PLUTO1['Predict_Source_EUI'])
f.suptitle('NYC Predicted_Source_EUI(kBtu)')
base = PLUTOS.plot(color='white', edgecolor='black', linewidth = 0.1 ,ax =ax)
PLUTO1.plot(column='Predict_Source_EUI', cmap='hot',vmin = vmin, vmax = vmax, linewidth=0.5, edgecolor='black',ax = ax)
# add a colorbar for the choropleth
fig = ax.get_figure()
cax = fig.add_axes([0.9, 0.1, 0.03, 0.8])
sm = plt.cm.ScalarMappable(cmap='hot', norm=plt.Normalize(vmin=vmin, vmax=vmax))
# fake up the array of the scalar mappable...
sm._A = []
fig.colorbar(sm, cax=cax)