My Python Cheatsheet for Econometrics

Charlie
7 min read · Dec 13, 2020

These are my personal notes on Python methods in econometrics. I will update them regularly to ensure accuracy, clarity, and usefulness. I'm happy to receive any corrections or suggestions!

  • Matrix Operations
  • Latex Output
  • Statistics
  • Random Number and Simulation
  • Linear Regression
  • Plotting
  • Pandas
  • Data
  • Optimization
  • Time Series
  • Formatting

Matrix Operations

Matrix multiplication

import numpy as np
# variance matrix
var = np.matmul(annual_v.T, annual_v)
# cov = var * correlation matrix
cov = np.matmul(var, corr)
A @ B  # the @ operator is matrix multiplication, equivalent to np.matmul for 2-D arrays
port_var = np.dot(weights.T, np.dot(df_invest.cov(), weights))
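
For 2-D arrays, A @ B, np.matmul(A, B), and np.dot(A, B) all give the same matrix product:

A = np.arange(4).reshape(2, 2)
B = np.eye(2)
print(np.allclose(A @ B, np.matmul(A, B)) and np.allclose(A @ B, np.dot(A, B)))  # True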

Diagonal matrix

x = np.diag(np.arange(4))

Triangular matrix

a = np.triu(np.ones((3, 3)), 1)    
a.T # transpose

Solving a system of linear equations

matrix = np.vstack([beta, smb, hml])
z = np.array([[-1]*3, [0]*3, [0]*3])
x, residuals, rank, s = np.linalg.lstsq(matrix, z, rcond=None)
# Least squares is a standard approach to problems with more equations than unknowns, also known as overdetermined systems
# returns: solution x, sum of squared residuals, rank of the matrix, singular values of the matrix
# https://sodocumentation.net/numpy/topic/3753/linear-algebra-with-np-linalg
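
A toy overdetermined system (three equations, two unknowns, made-up numbers) to show the call:

A = np.array([[1., 1.], [1., 2.], [1., 3.]])
b = np.array([1., 2., 2.])
x, residuals, rank, s = np.linalg.lstsq(A, b, rcond=None)
print(x)  # solution minimizing the sum of squared residuals ||Ax - b||^2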

Latex Output

import array_to_latex as a2l
a2l.to_ltx(A)  # A is the numpy array (or dataframe) to convert
print(df.describe().to_latex())
print(df.corr().to_latex())
print(model.summary().as_latex())
print(sm.stats.anova_lm(model, typ=2).to_latex())
from tabulate import tabulate
tabulate([['95%',abs(VaR_95*invest)]],headers=['Confidence level','Value at Risk'])

Statistics

Student's t-test, for an unknown population standard deviation (sample n > 30 or normally distributed data). This checks whether the sample and a hypothesized zero-mean sample come from the same distribution.

from scipy import stats
alpha = 0.05
df = 5
t_stat = 1.96
# calculate the one tail critical value, equivalent to Excel TINV(alpha,df)
cv = stats.t.ppf(1 - alpha, df)
# calculate the two tail critical value
cv = stats.t.ppf(1 - (alpha/2), df)
# calculate the p-value
p = (1 - stats.t.cdf(abs(t_stat), df)) * 2
print((cv,p))
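
scipy can also run the one-sample test in a single call; a minimal sketch with made-up return data:

sample = np.array([0.1, -0.2, 0.05, 0.3, -0.1, 0.15])  # hypothetical sample
t_stat, p = stats.ttest_1samp(sample, popmean=0)  # H0: mean = 0, two-tailed p-value
print((t_stat, p))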

Two tail T-test for equal mean

from scipy import stats
print(stats.ttest_ind(a=df1["PPY"], b=df2["PPY"], equal_var=False))

Output:

Ttest_indResult(statistic=0.2844374536719791, pvalue=0.7804443088685238)

Z test for known population standard deviation (sample n>30 or normally distributed)

# given area under normal to get the z-score
print(stats.norm.ppf(1900/2000,loc=1,scale=5/np.sqrt(2000)))
# given z-score to return area under the curve
print(stats.norm.cdf(1,loc=1,scale=5/np.sqrt(2000)))
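
statsmodels also offers a one-call z-test; a minimal sketch reusing the hypothetical sample from the t-test sketch above:

from statsmodels.stats.weightstats import ztest
z_stat, p = ztest(sample, value=0)  # H0: population mean = 0
print((z_stat, p))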

Confidence Interval

# CI = mu +- t * se (the standard error differs for each sample statistic)
def sample_mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    # h = stats.t.ppf((1+confidence)/2., n-1)*se # two tail
    h = stats.t.ppf(confidence, n-1)*se # one tail
    return "{0:10.5f},{1:10.5f},{2:10.5f},{3:10.5f}".format(m, m-h, m+h, h*2)
print('for 95% confidence interval one tail t test - mean, lower, upper, width')
print(sample_mean_confidence_interval(df.VBTLX))

1st-4th Moments

from scipy import stats
skew = df.resample('10Y').apply(lambda x: stats.skew(x))[:-1]
kurt = df.resample('10Y').apply(lambda x:stats.kurtosis(x))[:-1]

Jarque-Bera JB test

JB = df.resample('10Y').apply(lambda x:stats.jarque_bera(x).pvalue)[:-1]

Sample statistics

# pandas (df) and numpy (np) methods, listed together for easy search
print(df['Simple Return'].mean())
print(df['Simple Return'].sem(axis=0))
print(df['Simple Return'].std(axis=0)/np.sqrt(len(df)))
print(df['Simple Return'].std(axis=0))
print(df['Simple Return'].autocorr(lag=1))
print(df['Simple Return'].autocorr(lag=2))
print(np.corrcoef(series1,series2)) # will return n by n matrix
print(df.cumsum())
print(df.pct_change())
print(df.groupby(['column']))
print(df.column.str.contains(r'ha',na=True))
print(df.column.isin(['haha']))
print(df.unstack())
print(df.index)
df.info() # prints the summary itself; no print() needed
print(df.columns)
print(df.shape)
print(df.column.shift(-1))
np.var
np.std
np.mean
np.median
np.log
np.sqrt
np.power(array1,3)
np.hstack
np.vstack
np.ones
np.zeros
np.nonzero()
np.exp
np.sin
np.max
np.argmax() # index of maximum
np.any
np.all
np.cumsum
np.round
np.where(df.test>0,1,0)
np.array([]).tolist()
arr1 = np.append(arr1,arr2)

cointegration

from statsmodels.tsa.stattools import coint
tscore, pvalue, significance = coint(df.CumsumG, df.CumsumS)
# a small p-value suggests the two series are cointegrated

Random Number and Simulation

fixed step

np.arange(start=1, stop=25, step=1)
pd.date_range(start='1/1/2020', periods=T, freq='D')
np.linspace(port_returns.min(), port_returns.max(), 150)

random normal

# seed so reruns give the same results; note the stdlib random.seed
# does not affect numpy, so seed numpy's generator directly
np.random.seed(123)
np.random.normal(mu, sigma, size=(10,1))
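
Newer numpy (1.17+) also provides a Generator API that carries its own seeded state:

rng = np.random.default_rng(123)
rng.normal(mu, sigma, size=(10, 1))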

random integer

np.random.randint(low=0, high=100, size=(13,1)) # randint takes low/high, not loc/scale

random from exponential distribution

np.random.exponential(scale=1,size=(128))

random from uniform distribution

np.random.uniform(low=0,high=1,size=(36))

random from dirichlet distribution

weights = np.random.dirichlet(np.ones(num_assets),size=1) # regarding interval transformation see my answer at
# https://stackoverflow.com/questions/63910689/transformed-dirichlet-array-with-range-1-1-in-numpy

Linear Regression

OLS on multiple dataframes

import statsmodels.api as sm
X = df2[['Excess_Market','HML','SMB','UMD']]
y = df1['Excess Return']
X = sm.add_constant(X)
m = sm.OLS(y,X)
model = m.fit()

(Better) OLS on single dataframe with specified formula

m = sm.formula.ols(formula='Value~Time',data=df)
# default with adding a constant for intercept
model = m.fit() # result object
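
The formula interface adds the intercept by default; appending -1 to the formula suppresses it:

m = sm.formula.ols(formula='Value ~ Time - 1', data=df) # regression through the origin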

OLS Outputs

m.endog_names # the endog (dependent / regressand) variable
m.exog_names # the exog (independent / regressor) variables
m.df_model
m.df_resid
model.params # regression coefficients
model.params.Intercept
model.bse # standard error
model.tvalues
model.pvalues
model.rsquared
model.resid
model.summary()
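
A few more results attributes that often come in handy:

model.conf_int(alpha=0.05) # confidence intervals for the coefficients
model.fittedvalues
model.aic
model.bic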

Predictions

# in-sample prediction
model.predict(exog=X)
# out-of-sample prediction
X = pd.DataFrame([61,62],columns=["Time"],index=pd.date_range(start="20201001",periods=2,freq='M'))
X = sm.add_constant(X)
model.predict(exog=X)
# non-linear prediction
x = df.Time
X = np.column_stack((x,x**2,x**3,x**4,x**5))
X = sm.add_constant(X)
m = sm.OLS(y,X)
model = m.fit()
Xnew = pd.DataFrame([[61,61**2,61**3,61**4,61**5],[62,62**2,62**3,62**4,62**5]],columns=["Time1","Time2","Time3","Time4","Time5"],index=pd.date_range(start="20201001",periods=2,freq='M'))
Xnew = sm.add_constant(Xnew)
model.predict(exog=Xnew)

polyfit in numpy

# linear polynomial (mean reverting nature)
s,b=np.polyfit(df.Time,df.Value,1)
print(np.poly1d(np.polyfit(X,y,1)) )
plt.plot(df.Time,s*df.Time+b)
# 5th power polynomial (trend following nature)
t,v,w,x,y,z = np.polyfit(df.Time,df.Value,5)
print(np.poly1d(np.polyfit(X,y,5)))
plt.plot(df.Time,t*np.power(df.Time,5)+v*np.power(df.Time,4)+w*np.power(df.Time,3)+x*np.power(df.Time,2)+y*np.power(df.Time,1)+z)
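
np.poly1d returns a callable polynomial, which avoids typing out each power by hand:

p = np.poly1d(np.polyfit(df.Time, df.Value, 5))
plt.plot(df.Time, p(df.Time))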

polyfit in seaborn

sns.lmplot(x='Time',y='Value',data=df)

ANOVA

import statsmodels.api as sm # lm = linear model
# typ=2 means type 2 sum of squares - most common
m = sm.formula.ols('conformity~C(fcategory, Sum)*C(partner_status, Sum)',data=data)
model = m.fit()
print(sm.stats.anova_lm(model, typ=2)) # dataframe is returned

Plotting

matplotlib plot

import matplotlib.pyplot as plt
import matplotlib as mpl
plt.figure(figsize = (10,6))
# plt.title("test")
# plt.xlabel('Portfolio Volatility')
# plt.ylabel('Portfolio Return')
# plt.colorbar(label='haha')
plt.plot(x, y, linestyle='--', color='purple', alpha=0.8, label='haha', linewidth=0.1, marker='x', markersize=10) # or use a format string like plt.plot(x, y, 'r*')
plt.savefig('5.b.2.png')
plt.show()

seaborn plot

import seaborn as sns
plt.figure(figsize=(10,6))
sns.distplot(x, hist=False) # deprecated in newer seaborn; use sns.kdeplot(x)
plt.savefig('5.b.2.png')
plt.show()

dataframe plot

fig = df.plot(kind='line',figsize=(10,6),title="Test",color='green',grid=True,label='test')
fig.get_figure().savefig('8.a.1.png')
plt.show()

statsmodel api plot

import statsmodels.api as sm
with mpl.rc_context():
    mpl.rc("figure", figsize=(10,6))
    sm.qqplot(x, line='45')
    plt.savefig('5.b.1.png')
    plt.show()
# regression plot
fig = plt.figure(figsize =(15,8))
fig = sm.graphics.plot_regress_exog(model,'ReturnS',fig=fig)
plt.savefig('15.b.2.png')
plt.show()

histogram

plt.hist(y, bins=np.arange(start=y.min(),stop=y.max(),step=0.1),density=False) 
np.histogram(a=y,bins=np.arange(start=0,stop=10,step=0.01),density=True)

scatter plot

plt.plot(df.Time, df.Value)
plt.scatter(a, b, c=a/b, marker='o') # c (not colorbar) sets per-point colors

separate subplot

fig = plt.figure(figsize=(20,8))
plt.subplot(2,2,1)
df.hist(bins=30,ax=plt.gca())
# get current axes, if using plt to plot then no need
plt.subplot(2,2,2)
df.hist(cumulative=True,bins=300,ax=plt.gca())
plt.subplot(2,2,3)
df.boxplot(ax=plt.gca())
plt.subplot(2,2,4)
sm.qqplot((df['CER']-np.mean(df['CER']))/np.std(df['CER']),line ='45',ax=plt.gca())
fig.savefig('8.a.2.png')

overlapping subplot

fig, ax = plt.subplots(figsize=(16,5)) # plt.subplots (not plt.figure) returns the fig, ax pair
ax1 = df.G.plot(color='green', grid=True, label='Gold')
ax2 = df.S.plot(color='purple', grid=True, secondary_y=True, label='Silver')
ax.grid(True)
ax.axhline(y=0, color='black', linestyle='-') # horizontal line
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
plt.legend(h1+h2, l1+l2, loc=2)
plt.show()

Correlation matrix plot

from pandas.plotting import scatter_matrix
scatter_matrix(df,figsize=(10,10),alpha=0.5)
sns.heatmap(df.corr(), annot=True, fmt='.1g',cmap=plt.cm.Blues)
plt.show()

Pandas DataFrame and Wrangling

Create dataframe from dictionary

df = pd.DataFrame({'Month': [], 'PRR': []})

Indexing and rename

df.reset_index(drop=True,inplace=True)
df = df.set_index('Date').sort_index() # inplace=True returns None, so chain without it
df.index = pd.to_datetime(df.index)
df1.columns = df1.columns.astype(str)
df2.rename(columns={'Mkt-RF': 'Excess_Market'}, inplace=True)

Drop column

df.drop('Adj Close', axis=1, inplace=True)
del df2['Net SharesChanged']

Sort

df2 = df2.sort_values('Value', ascending=False).drop_duplicates()
sorted(std_dict.items(), key=lambda x: x[1], reverse=True)[:5]

NA NAN Null

df = df.fillna(method='ffill')
df = df.fillna(method='bfill')
df = df.dropna(subset=['Sector','industry'], axis=0, how='all') # drop rows missing both columns; how='any' drops rows missing either
df = df.isnull()
df2.replace(to_replace='None', value=np.nan, inplace=True, regex=False)

Merge

df = pd.merge(pd.merge(df_a,df_b,on='caldt'),df_c_df,on='date',how='inner') #left_on,right_on
df.columns = ['df_a','df_b','df_c']
# multiple merge
df = reduce(lambda left,right: pd.merge(left,right,on=['Date'],how='outer'), countries)
df.columns = ['Date','CA','FR','GE','JP','UK','US','AU','HK']

Data

yahoo api

from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()
# yahoo can get minute data for last 30 days, max 7 day period.
indexes = '^GSPTSE ^FCHI ^GDAXI ^N225 ^FTSE ^GSPC ^AORD ^HSI'
indexes = indexes.split() # this must be a list
data = pdr.get_data_yahoo(indexes, start="2000-09-01", end="2020-12-01", interval='1mo')["Adj Close"]
gold = pd.DataFrame(data=data)

WRDS api (Wharton business school)

import wrds
db = wrds.Connection(wrds_user='')
db.create_pgpass_file()
db.list_libraries()
db.list_tables(library='crsp')
# get fund numbers first
fund_df = db.get_table(library='crsp', table='fund_names',obs=2)
fund_df = fund_df.loc[fund_df['ticker'].isin(['VBTLX','VEMAX','VSMAX']),:]
fund_df.drop_duplicates(subset=['crsp_fundno'],keep='last',inplace=True)
# find the monthly returns
month_df = db.get_table(library='crsp', table='monthly_returns')
VBTLX_df = month_df.loc[month_df['crsp_fundno']==31244]
FB = db.raw_sql("select date,ret from crsp.dsf where (cusip='30303M10') and date>='2018-01-01' and date<='2019-12-31'")

FMP api

from fmp_python.fmp import FMP
import FundamentalAnalysis as fa
ticker = "AAPL,TSLA"
api_key = ""
fa.profile(ticker, api_key)

Morningstar api

import morningstar

Beautiful soup web scraping

import requests
from bs4 import BeautifulSoup
URL = 'https://t#'
page = requests.get(URL)
print(page.status_code) # This should print 200
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='page-wrapper')
print(results.prettify())
table = soup.find('table', class_='table table-striped table-hover table-sm')
df = pd.read_html(str(table))[0]

URL and zip

import urllib.request
import zipfile
ff_url = "http://"
urllib.request.urlretrieve(ff_url,'fama_french.zip')
zip_file = zipfile.ZipFile('fama_french.zip', 'r')
zip_file.extractall()
zip_file.close()

VaR (Value at Risk)

def calculate_var(fund, confidence):
    print(fund, confidence)
    invest = 100000
    # use z score instead of t to calculate VaR since the sample size is big
    z = abs(stats.norm.ppf((1-confidence)/2, 0, 1))
    se = df[fund].sem()
    std = df[fund].std()
    mean = df[fund].mean()
    relative_var = invest*z*std
    abs_var = invest*(z*std+mean)
    return "relative var = {:.0f} absolute var = {:.0f}".format(relative_var, abs_var)

def simulated_var(fund, confidence):
    invest = 100000
    VaR_95 = df[fund].quantile(1-confidence)
    return tabulate([['95%', abs(VaR_95*invest)]], headers=['Confidence level','Value at Risk'])
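
Usage: a minimal call, assuming df holds the fund return series loaded earlier (e.g. the VBTLX column from the WRDS example):

print(calculate_var('VBTLX', 0.95))
print(simulated_var('VBTLX', 0.95))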

data input

xls = pd.ExcelFile(r'C:\Users\AQR.xlsx')
df1 = pd.read_excel(xls, 'Return',index_col=0, header=2)
df = pd.read_csv('F-F_Research_Data_Factors.csv', skiprows = 3, index_col = 0)

data output

from openpyxl import load_workbook
out_path = "C:\\Users\\python.xlsx"
writer = pd.ExcelWriter(out_path, engine='openpyxl')
# try to open an existing workbook
writer.book = load_workbook(out_path)
# copy existing sheets
writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
# read existing file
reader = pd.read_excel(out_path)
# write out the new sheet
df2.to_excel(writer,index=False,header=False,startrow=len(reader)+1)
writer.close()

Asset Return

if 'Simple Return' not in df: # guard so reruns work
    df['Log Return'] = np.log(df['Adj Close']).diff() # log return: diff of log prices
    df['Simple Return'] = df['Adj Close'].pct_change() # simple return: percent change, not the raw price diff
    df = df[1:] # drop the first row, which is NaN after differencing
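
To compound simple returns into a cumulative return:

cumulative = (1 + df['Simple Return']).cumprod() - 1 # total return since the start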

Email

import smtplib
gmail_user = '@gmail.com'
gmail_password = ''
sent_from = gmail_user
to = ['', 'test@hotmail.com']
subject = 'haha strategy'
body = 'buy ticker '+str(df2.iloc[0:5,1])
email_text = """\
From: %s
To: %s
Subject: %s
%s
""" % (sent_from, ", ".join(to), subject, body)
try:
    server = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server.ehlo()
    server.login(gmail_user, gmail_password)
    server.sendmail(sent_from, to, email_text)
    server.close()
    print('Email sent!')
except Exception as e:
    print(str(e))

Rolling average

df_month = df.resample('10Y').sum()
a = np.cumsum(df1['Decile Return'], axis=0)[1::10]
average_yearly_return_for_each_decade = (np.array(a[1:]) - np.array(a[:-1]))/10

Optimization

Portfolio construction

import scipy.optimize as optimize
def minimize_sharpe(weights):
    return -portfolio_stats(weights)['sharpe']
# assets add up to 1 and return being target return
constraints = ({'type':'eq','fun': lambda x: portfolio_stats(x)['return']-target_return},{'type':'eq','fun': lambda x: np.sum(x)-1})
# one asset can take up 0 - 100% position
bounds = tuple((0,1) for x in range(len(countries)))
# start from equally weighted
initializer = len(countries) * [1./len(countries),]
optimal_sharpe = optimize.minimize(minimize_sharpe,
                                   initializer,
                                   method='SLSQP',
                                   bounds=bounds,
                                   constraints=constraints)
# get the output
optimal_sharpe_weights=optimal_sharpe['x'].round(4) # x is our output
optimal_sharpe['fun']
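
The portfolio_stats helper is used above but never shown; a minimal sketch, assuming a DataFrame df_invest of daily returns and 252 trading days per year:

def portfolio_stats(weights):
    weights = np.array(weights)
    port_return = np.sum(df_invest.mean() * weights) * 252 # annualized return
    port_vol = np.sqrt(np.dot(weights.T, np.dot(df_invest.cov() * 252, weights))) # annualized volatility
    return {'return': port_return, 'volatility': port_vol, 'sharpe': port_return / port_vol}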

optimize system of equations

from scipy.optimize import minimize
def objective(x):
    x1 = x[0]
    x2 = x[1]
    x3 = x[2]
    x4 = x[3]
    return -1.50440748*x1 - 1.35598889*x2 - 0.50999618*x3 + 0.77768992*x4
def constraint1(x):
    return -x[0] - x[1] - x[2] - x[3]
initializer = np.ones(4) # minimize expects a 1-D starting point
bounds = tuple((-1,1) for i in range(0,4))
con1 = {'type':'eq','fun':constraint1}
cons = [con1]
sol = minimize(objective, initializer, method='SLSQP', bounds=bounds, constraints=cons) # pass cons so the constraint is actually applied

Time Series

datetime

import datetime as dt
t = pd.date_range(start='1/1/2020', periods=T, freq='D')
df = pd.DataFrame(data=r,index=t,columns=['CER'])
df.index = pd.to_datetime(df.index,format= '%Y%m')
dt.datetime.now()-dt.timedelta(days=90)

acf

print(df['Simple Return'].autocorr(lag=1))
print(df['Simple Return'].autocorr(lag=2))
print(df['Simple Return'].autocorr(lag=3))
from statsmodels.graphics.tsaplots import plot_acf
with mpl.rc_context():
    mpl.rc("figure", figsize=(10,6))
    plot_acf(df['Simple Return'], lags=5, title='First 5 lags')
    plt.savefig('9.a.3.png')
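
The matching partial-autocorrelation plot lives in the same module:

from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(df['Simple Return'], lags=5)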

Formatting

print('haah %s and %s is %.4f' % ('Gold','Silver',pvalue))
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.max_columns', None)
#pd.set_option('max_colwidth',100)
#pd.set_option('display.float_format', lambda x: '%10.3f' % x)
#pd.reset_option('display.float_format')
#np.set_printoptions(precision=4, suppress=True)
%precision %10.4f # non scientific
#warnings.filterwarnings('ignore') # not recommended to turn warnings off
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print(os.getcwd())
print(pd.__version__)
print(sys.version)
