About this dataset: http://www.cru.uea.ac.uk/documents/421974/1295957/Info+sheet+%231.pdf/c612fc7e-babb-463c-b5e3-124ac76680c5
Monthly temperature readings in degree's celsius from baseline established by 50 year average. So, +1.2 means "1.2 degrees celsius above the baseline." Columns from several datasources, but we'll be focusing on three reading synthesized by the Climate Research Unit calculated back to 1850. They provide three data points: "NH" for "Northern Hemisphere", "SH" for "Southern Hemisphere", and a "GL" for "Global."
# usual set up
%pylab inline
rcParams['figure.figsize'] = 14,7
import pandas as pd
# load the data from the file
data = pd.read_csv('/Users/oranlooney/Dropbox/code/python/scipy/climate_change_data.csv')
# focus in on the data we're interested in today
data = data[['Year', 'Month', 'Date', 'CRU3 NH', 'CRU3 SH', 'CRU3 GL']]
data = data.rename(columns={'CRU3 NH':'North', 'CRU3 SH':'South', 'CRU3 GL':'Global'})
# sanity check: yep, they're numbers
data.head()
# statistical moments and ranges of each column
data.describe()
why mean and std are (sometimes) meaningless: Abscombe's Quartet
# start by simply plotting all available data to get an intuitive picture
data.plot(x='Date', y=['North', 'South', 'Global'])
# let's zoom in on the last data to get a close up picture of how the data varies.
last = max(data['Date'])
last_decade = data[ data['Date'] >= last-10][['Date', 'North', 'South', 'Global']]
last_decade.plot(x='Date')
linear = linspace(last-10, last, 100)
plot(linear, 0.3 + 0.3*cos(linear*2*pi), 'y--')
# I suspect seasonal variation, so I'm going to use a Fast Fourier Transform to check for periodicity in the data.
import statsmodels
from scipy import fftpack as fft
# Check the 'North' temperature data for seasonal variation
temps = data['North'].fillna(method='pad')
# rfft => real fast-fourier transform
weights = fft.rfft(temps)
frequencies = fft.rfftfreq(temps.size)
plot(frequencies, weights)
ylim([-10, 250])
annotate('annual', xy=(1.0/12.0, 80), xytext=(1/12.0, 100), arrowprops=dict(arrowstyle='->',))
# Check the global temperature data for seasonal variation
temps = data['Global'].fillna(method='pad')
# rfft => real fast-fourier transform
weights = fft.rfft(temps)
frequencies = fft.rfftfreq(temps.size)
plot(frequencies, weights)
ylim([-10, 250])
# fit a linear regression model to the 'Global' temperature data
import statsmodels.api as sm
X = sm.add_constant(data['Date'])
Y = data['Global'].fillna(method='pad')
linear_model = sm.OLS(Y, X)
results = linear_model.fit()
results.summary()
# let's graph the fit line to see if it makes sense
data.plot(x='Date', y=['Global'])
plot(data['Date'], data['Date'] * 0.0054 - 10.89, 'r-', linewidth=2, label='fit line')