My Python Cheatsheet for Econometrics

Charlie
7 min read · Dec 13, 2020

These are my personal notes on Python methods in econometrics. I will update them regularly to ensure accuracy, clarity, and usefulness. I'm happy to receive any corrections or suggestions!

  • Matrix Operations
  • Latex Output
  • Statistics
  • Random Number and Simulation
  • Linear Regression
  • Plotting
  • Pandas
  • Data
  • Optimization
  • Time Series
  • Formatting

Matrix Operations

Matrix multiplication

import numpy as np
# variance matrix
var = np.matmul(annual_v.T, annual_v)
# cov = var * correlation matrix
cov = np.matmul(var, corr)
A @ B  # the @ operator is matrix multiplication, equivalent to np.matmul for 2-D arrays
port_var = np.dot(weights.T, np.dot(df_invest.cov(), weights))
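
For 2-D arrays, A @ B, np.matmul(A, B), and np.dot(A, B) all give the same matrix product:

A = np.arange(4).reshape(2, 2)
B = np.eye(2)
print(np.allclose(A @ B, np.matmul(A, B)) and np.allclose(A @ B, np.dot(A, B)))  # True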

Diagonal matrix

x = np.diag(np.arange(4))

Triangular matrix

a = np.triu(np.ones((3, 3)), 1)    
a.T # transpose

Solving a system of linear equations

matrix = np.vstack([beta, smb, hml])
z = np.array([[-1]*3, [0]*3, [0]*3])
x, residuals, rank, s = np.linalg.lstsq(matrix, z, rcond=None)
# Least squares is a standard approach to problems with more equations than unknowns, also known as overdetermined systems
# returns: solution x, sum of squared residuals, rank of the matrix, singular values of the matrix
# https://sodocumentation.net/numpy/topic/3753/linear-algebra-with-np-linalg
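
A toy overdetermined system (three equations, two unknowns, made-up numbers) to show the call:

A = np.array([[1., 1.], [1., 2.], [1., 3.]])
b = np.array([1., 2., 2.])
x, residuals, rank, s = np.linalg.lstsq(A, b, rcond=None)
print(x)  # solution minimizing the sum of squared residuals ||Ax - b||^2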

Latex Output

import array_to_latex as a2l
a2l.to_ltx(A)  # A is the numpy array (or dataframe) to convert
print(df.describe().to_latex())
print(df.corr().to_latex())
print(model.summary().as_latex())
print(sm.stats.anova_lm(model, typ=2).to_latex())
from tabulate import tabulate
tabulate([['95%',abs(VaR_95*invest)]],headers=['Confidence level','Value at Risk'])

Statistics

Student's t-test, for an unknown population standard deviation (sample n > 30 or normally distributed data). This checks whether the sample and a hypothesized zero-mean sample come from the same distribution.

from scipy import stats
alpha = 0.05
df = 5
t_stat = 1.96
# calculate the one tail critical value, equivalent to Excel TINV(alpha,df)
cv = stats.t.ppf(1 - alpha, df)
# calculate the two tail critical value
cv = stats.t.ppf(1 - (alpha/2), df)
# calculate the p-value
p = (1 - stats.t.cdf(abs(t_stat), df)) * 2
print((cv,p))
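
scipy can also run the one-sample test in a single call; a minimal sketch with made-up return data:

sample = np.array([0.1, -0.2, 0.05, 0.3, -0.1, 0.15])  # hypothetical sample
t_stat, p = stats.ttest_1samp(sample, popmean=0)  # H0: mean = 0, two-tailed p-value
print((t_stat, p))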

Two tail T-test for equal mean

from scipy import stats
print(stats.ttest_ind(a=df1["PPY"], b=df2["PPY"], equal_var=False))

Output:

Ttest_indResult(statistic=0.2844374536719791, pvalue=0.7804443088685238)

Z test for known population standard deviation (sample n>30 or normally distributed)

# given area under normal to get the z-score
print(stats.norm.ppf(1900/2000,loc=1,scale=5/np.sqrt(2000)))
# given z-score to return area under the curve
print(stats.norm.cdf(1,loc=1,scale=5/np.sqrt(2000)))
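
statsmodels also offers a one-call z-test; a minimal sketch reusing the hypothetical sample from the t-test sketch above:

from statsmodels.stats.weightstats import ztest
z_stat, p = ztest(sample, value=0)  # H0: population mean = 0
print((z_stat, p))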

Confidence Interval

# CI = mu +- t * se (the standard error differs for each sample statistic)
def sample_mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    # h = stats.t.ppf((1+confidence)/2., n-1)*se # two tail
    h = stats.t.ppf(confidence, n-1)*se # one tail
    return "{0:10.5f},{1:10.5f},{2:10.5f},{3:10.5f}".format(m, m-h, m+h, h*2)
print('for 95% confidence interval one tail t test - mean, lower, upper, width')
print(sample_mean_confidence_interval(df.VBTLX))

1st-4th Moments

from scipy import stats
skew = df.resample('10Y').apply(lambda x: stats.skew(x))[:-1]
kurt = df.resample('10Y').apply(lambda x:stats.kurtosis(x))[:-1]

Jarque-Bera JB test

JB = df.resample('10Y').apply(lambda x:stats.jarque_bera(x).pvalue)[:-1]

Sample statistics

# pandas (df) and numpy (np) methods, listed together for easy search
print(df['Simple Return'].mean())
print(df['Simple Return'].sem(axis=0))
print(df['Simple Return'].std(axis=0)/np.sqrt(len(df)))
print(df['Simple Return'].std(axis=0))
print(df['Simple Return'].autocorr(lag=1))
print(df['Simple Return'].autocorr(lag=2))
print(np.corrcoef(series1,series2)) # will return n by n matrix
print(df.cumsum())
print(df.pct_change())
print(df.groupby(['column']))
print(df.column.str.contains(r'ha',na=True))
print(df.column.isin(['haha']))
print(df.unstack())
print(df.index)
df.info() # prints the summary itself; no print() needed
print(df.columns)
print(df.shape)
print(df.column.shift(-1))
np.var
np.std
np.mean
np.median
np.log
np.sqrt
np.power(array1,3)
np.hstack
np.vstack
np.ones
np.zeros
np.nonzero()
np.exp
np.sin
np.max
np.argmax() # index of maximum
np.any
np.all
np.cumsum
np.round
np.where(df.test>0,1,0)
np.array([]).tolist()
arr1 = np.append(arr1,arr2)

cointegration

from statsmodels.tsa.stattools import coint
tscore, pvalue, significance = coint(df.CumsumG, df.CumsumS)
# a small p-value suggests the two series are cointegrated

Random Number and Simulation

fixed step

np.arange(start=1, stop=25, step=1)
pd.date_range(start='1/1/2020', periods=T, freq='D')
np.linspace(port_returns.min(), port_returns.max(), 150)

random normal

# seed so reruns give the same results; note the stdlib random.seed
# does not affect numpy, so seed numpy's generator directly
np.random.seed(123)
np.random.normal(mu, sigma, size=(10,1))
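
Newer numpy (1.17+) also provides a Generator API that carries its own seeded state:

rng = np.random.default_rng(123)
rng.normal(mu, sigma, size=(10, 1))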

random integer

np.random.randint(low=0, high=100, size=(13,1)) # randint takes low/high, not loc/scale

random from exponential distribution

np.random.exponential(scale=1,size=(128))

random from uniform distribution

np.random.uniform(low=0,high=1,size=(36))

random from dirichlet distribution

weights = np.random.dirichlet(np.ones(num_assets),size=1) # regarding interval transformation see my answer at
# https://stackoverflow.com/questions/63910689/transformed-dirichlet-array-with-range-1-1-in-numpy

Linear Regression

OLS on multiple dataframes

import statsmodels.api as sm
X = df2[['Excess_Market','HML','SMB','UMD']]
y = df1['Excess Return']
X = sm.add_constant(X)
m = sm.OLS(y,X)
model = m.fit()

(Better) OLS on single dataframe with specified formula

m = sm.formula.ols(formula='Value~Time',data=df)
# default with adding a constant for intercept
model = m.fit() # result object
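
The formula interface adds the intercept by default; appending -1 to the formula suppresses it:

m = sm.formula.ols(formula='Value ~ Time - 1', data=df) # regression through the origin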

OLS Outputs

m.endog_names # the endog (dependent / regressand) variable
m.exog_names # the exog (independent / regressor) variables
m.df_model
m.df_resid
model.params # regression coefficients
model.params.Intercept
model.bse # standard error
model.tvalues
model.pvalues
model.rsquared
model.resid
model.summary()
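
A few more results attributes that often come in handy:

model.conf_int(alpha=0.05) # confidence intervals for the coefficients
model.fittedvalues
model.aic
model.bic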

Predictions

# in-sample prediction
model.predict(exog=X)
# out-of-sample prediction
X = pd.DataFrame([61,62],columns=["Time"],index=pd.date_range(start="20201001",periods=2,freq='M'))
X = sm.add_constant(X)
model.predict(exog=X)
# non-linear prediction
x = df.Time
X = np.column_stack((x,x**2,x**3,x**4,x**5))
X = sm.add_constant(X)
m = sm.OLS(y,X)
model = m.fit()
Xnew = pd.DataFrame([[61,61**2,61**3,61**4,61**5],[62,62**2,62**3,62**4,62**5]],columns=["Time1","Time2","Time3","Time4","Time5"],index=pd.date_range(start="20201001",periods=2,freq='M'))
Xnew = sm.add_constant(Xnew)
model.predict(exog=Xnew)

polyfit in numpy

# linear polynomial (mean reverting nature)
s,b=np.polyfit(df.Time,df.Value,1)
print(np.poly1d(np.polyfit(X,y,1)) )
plt.plot(df.Time,s*df.Time+b)
# 5th power polynomial (trend following nature)
t,v,w,x,y,z = np.polyfit(df.Time,df.Value,5)
print(np.poly1d(np.polyfit(X,y,5)))
plt.plot(df.Time,t*np.power(df.Time,5)+v*np.power(df.Time,4)+w*np.power(df.Time,3)+x*np.power(df.Time,2)+y*np.power(df.Time,1)+z)
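
np.poly1d returns a callable polynomial, which avoids typing out each power by hand:

p = np.poly1d(np.polyfit(df.Time, df.Value, 5))
plt.plot(df.Time, p(df.Time))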

polyfit in seaborn

sns.lmplot(x='Time',y='Value',data=df)

ANOVA

import statsmodels.api as sm # lm = linear model
# typ=2 means type 2 sum of squares - most common
m = sm.formula.ols('conformity~C(fcategory, Sum)*C(partner_status, Sum)',data=data)
model = m.fit()
print(sm.stats.anova_lm(model, typ=2)) # dataframe is returned

Plotting

matplotlib plot

import matplotlib.pyplot as plt
import matplotlib as mpl
plt.figure(figsize = (10,6))
# plt.title("test")
# plt.xlabel('Portfolio Volatility')
# plt.ylabel('Portfolio Return')
# plt.colorbar(label='haha')
plt.plot(x, y, linestyle='--', color='purple', alpha=0.8, label='haha', linewidth=0.1, marker='x', markersize=10) # or use a format string like plt.plot(x, y, 'r*')
plt.savefig('5.b.2.png')
plt.show()

seaborn plot

import seaborn as sns
plt.figure(figsize=(10,6))
sns.distplot(x, hist=False) # deprecated in newer seaborn; use sns.kdeplot(x)
plt.savefig('5.b.2.png')
plt.show()

dataframe plot

fig = df.plot(kind='line',figsize=(10,6),title="Test",color='green',grid=True,label='test')
fig.get_figure().savefig('8.a.1.png')
plt.show()

statsmodel api plot

import statsmodels.api as sm
with mpl.rc_context():
    mpl.rc("figure", figsize=(10,6))
    sm.qqplot(x, line='45')
    plt.savefig('5.b.1.png')
    plt.show()
# regression plot
fig = plt.figure(figsize =(15,8))
fig = sm.graphics.plot_regress_exog(model,'ReturnS',fig=fig)
plt.savefig('15.b.2.png')
plt.show()

histogram

plt.hist(y, bins=np.arange(start=y.min(),stop=y.max(),step=0.1),density=False) 
np.histogram(a=y,bins=np.arange(start=0,stop=10,step=0.01),density=True)

scatter plot

plt.plot(df.Time, df.Value)
plt.scatter(a, b, c=a/b, marker='o') # c (not colorbar) sets per-point colors

separate subplot

fig = plt.figure(figsize=(20,8))
plt.subplot(2,2,1)
df.hist(bins=30,ax=plt.gca())
# get current axes, if using plt to plot then no need
plt.subplot(2,2,2)
df.hist(cumulative=True,bins=300,ax=plt.gca())
plt.subplot(2,2,3)
df.boxplot(ax=plt.gca())
plt.subplot(2,2,4)
sm.qqplot((df['CER']-np.mean(df['CER']))/np.std(df['CER']),line ='45',ax=plt.gca())
fig.savefig('8.a.2.png')

overlapping subplot

fig, ax = plt.subplots(figsize=(16,5)) # plt.subplots (not plt.figure) returns the fig, ax pair
ax1 = df.G.plot(color='green', grid=True, label='Gold')
ax2 = df.S.plot(color='purple', grid=True, secondary_y=True, label='Silver')
ax.grid(True)
ax.axhline(y=0, color='black', linestyle='-') # horizontal line
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
plt.legend(h1+h2, l1+l2, loc=2)
plt.show()

Correlation matrix plot

from pandas.plotting import scatter_matrix
scatter_matrix(df,figsize=(10,10),alpha=0.5)
sns.heatmap(df.corr(), annot=True, fmt='.1g',cmap=plt.cm.Blues)
plt.show()

Pandas DataFrame and Wrangling

Create dataframe from dictionary

df = pd.DataFrame({'Month': [], 'PRR': []})

Indexing and rename

df.reset_index(drop=True,inplace=True)
df = df.set_index('Date').sort_index() # inplace=True returns None, so chain without it
df.index = pd.to_datetime(df.index)
df1.columns = df1.columns.astype(str)
df2.rename(columns={'Mkt-RF': 'Excess_Market'}, inplace=True)

Drop column

df.drop('Adj Close', axis=1, inplace=True)
del df2['Net SharesChanged']

Sort

df2 = df2.sort_values('Value', ascending=False).drop_duplicates()
sorted(std_dict.items(), key=lambda x: x[1], reverse=True)[:5]

NA NAN Null

df = df.fillna(method='ffill')
df = df.fillna(method='bfill')
df = df.dropna(subset=['Sector','industry'], axis=0, how='all') # drop rows missing both columns; how='any' drops rows missing either
df = df.isnull()
df2.replace(to_replace='None', value=np.nan, inplace=True, regex=False)

Merge

df = pd.merge(pd.merge(df_a,df_b,on='caldt'),df_c_df,on='date',how='inner') #left_on,right_on
df.columns = ['df_a','df_b','df_c']
# multiple merge
df = reduce(lambda left,right: pd.merge(left,right,on=['Date'],how='outer'), countries)
df.columns = ['Date','CA','FR','GE','JP','UK','US','AU','HK']

Data

yahoo api

from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()
# yahoo can get minute data for last 30 days, max 7 day period.
indexes = '^GSPTSE ^FCHI ^GDAXI ^N225 ^FTSE ^GSPC ^AORD ^HSI'
indexes = indexes.split() # this must be a list
data = pdr.get_data_yahoo(indexes, start="2000-09-01", end="2020-12-01", interval='1mo')["Adj Close"]
gold = pd.DataFrame(data=data)

WRDS api (Wharton business school)

import wrds
db = wrds.Connection(wrds_user='')
db.create_pgpass_file()
db.list_libraries()
db.list_tables(library='crsp')
# get fund numbers first
fund_df = db.get_table(library='crsp', table='fund_names',obs=2)
fund_df = fund_df.loc[fund_df['ticker'].isin(['VBTLX','VEMAX','VSMAX']),:]
fund_df.drop_duplicates(subset=['crsp_fundno'],keep='last',inplace=True)
# find the monthly returns
month_df = db.get_table(library='crsp', table='monthly_returns')
VBTLX_df = month_df.loc[month_df['crsp_fundno']==31244]
FB = db.raw_sql("select date,ret from crsp.dsf where (cusip='30303M10') and date>='2018-01-01' and date<='2019-12-31'")

FMP api

from fmp_python.fmp import FMP
import FundamentalAnalysis as fa
ticker = "AAPL,TSLA"
api_key = ""
fa.profile(ticker, api_key)

Morningstar api

import morningstar

Beautiful soup web scraping

import requests
from bs4 import BeautifulSoup
URL = 'https://t#'
page = requests.get(URL)
print(page.status_code) # This should print 200
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='page-wrapper')
print(results.prettify())
table = soup.find('table', class_='table table-striped table-hover table-sm')
df = pd.read_html(str(table))[0]

URL and zip

import urllib.request
import zipfile
ff_url = "http://"
urllib.request.urlretrieve(ff_url,'fama_french.zip')
zip_file = zipfile.ZipFile('fama_french.zip', 'r')
zip_file.extractall()
zip_file.close()

VaR (Value at Risk)

def calculate_var(fund, confidence):
    print(fund, confidence)
    invest = 100000
    # use z score instead of t to calculate VaR since the sample size is big
    z = abs(stats.norm.ppf((1-confidence)/2, 0, 1))
    se = df[fund].sem()
    std = df[fund].std()
    mean = df[fund].mean()
    relative_var = invest*z*std
    abs_var = invest*(z*std+mean)
    return "relative var = {:.0f} absolute var = {:.0f}".format(relative_var, abs_var)

def simulated_var(fund, confidence):
    invest = 100000
    VaR_95 = df[fund].quantile(1-confidence)
    return tabulate([['95%', abs(VaR_95*invest)]], headers=['Confidence level','Value at Risk'])
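
Usage: a minimal call, assuming df holds the fund return series loaded earlier (e.g. the VBTLX column from the WRDS example):

print(calculate_var('VBTLX', 0.95))
print(simulated_var('VBTLX', 0.95))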

data input

xls = pd.ExcelFile(r'C:\Users\AQR.xlsx')
df1 = pd.read_excel(xls, 'Return',index_col=0, header=2)
df = pd.read_csv('F-F_Research_Data_Factors.csv', skiprows = 3, index_col = 0)

data output

from openpyxl import load_workbook
out_path = "C:\\Users\\python.xlsx"
writer = pd.ExcelWriter(out_path, engine='openpyxl')
# try to open an existing workbook
writer.book = load_workbook(out_path)
# copy existing sheets
writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
# read existing file
reader = pd.read_excel(out_path)
# write out the new sheet
df2.to_excel(writer,index=False,header=False,startrow=len(reader)+1)
writer.close()

Asset Return

if 'Simple Return' not in df: # guard so reruns work
    df['Log Return'] = np.log(df['Adj Close']).diff() # log return: diff of log prices
    df['Simple Return'] = df['Adj Close'].pct_change() # simple return: percent change, not the raw price diff
    df = df[1:] # drop the first row, which is NaN after differencing
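
To compound simple returns into a cumulative return:

cumulative = (1 + df['Simple Return']).cumprod() - 1 # total return since the start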

Email

import smtplib
gmail_user = '@gmail.com'
gmail_password = ''
sent_from = gmail_user
to = ['', 'test@hotmail.com']
subject = 'haha strategy'
body = 'buy ticker '+str(df2.iloc[0:5,1])
email_text = """\
From: %s
To: %s
Subject: %s
%s
""" % (sent_from, ", ".join(to), subject, body)
try:
    server = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server.ehlo()
    server.login(gmail_user, gmail_password)
    server.sendmail(sent_from, to, email_text)
    server.close()
    print('Email sent!')
except Exception as e:
    print(str(e))

Rolling average

df_month = df.resample('10Y').sum()
a = np.cumsum(df1['Decile Return'], axis=0)[1::10]
average_yearly_return_for_each_decade = (np.array(a[1:]) - np.array(a[:-1]))/10

Optimization

Portfolio construction

import scipy.optimize as optimize
def minimize_sharpe(weights):
    return -portfolio_stats(weights)['sharpe']
# assets add up to 1 and return being target return
constraints = ({'type':'eq','fun': lambda x: portfolio_stats(x)['return']-target_return},{'type':'eq','fun': lambda x: np.sum(x)-1})
# one asset can take up 0 - 100% position
bounds = tuple((0,1) for x in range(len(countries)))
# start from equally weighted
initializer = len(countries) * [1./len(countries),]
optimal_sharpe = optimize.minimize(minimize_sharpe,
                                   initializer,
                                   method='SLSQP',
                                   bounds=bounds,
                                   constraints=constraints)
# get the output
optimal_sharpe_weights=optimal_sharpe['x'].round(4) # x is our output
optimal_sharpe['fun']
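
The portfolio_stats helper is used above but never shown; a minimal sketch, assuming a DataFrame df_invest of daily returns and 252 trading days per year:

def portfolio_stats(weights):
    weights = np.array(weights)
    port_return = np.sum(df_invest.mean() * weights) * 252 # annualized return
    port_vol = np.sqrt(np.dot(weights.T, np.dot(df_invest.cov() * 252, weights))) # annualized volatility
    return {'return': port_return, 'volatility': port_vol, 'sharpe': port_return / port_vol}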

optimize system of equations

from scipy.optimize import minimize
def objective(x):
    x1 = x[0]
    x2 = x[1]
    x3 = x[2]
    x4 = x[3]
    return -1.50440748*x1 - 1.35598889*x2 - 0.50999618*x3 + 0.77768992*x4
def constraint1(x):
    return -x[0] - x[1] - x[2] - x[3]
initializer = np.ones(4) # minimize expects a 1-D starting point
bounds = tuple((-1,1) for i in range(0,4))
con1 = {'type':'eq','fun':constraint1}
cons = [con1]
sol = minimize(objective, initializer, method='SLSQP', bounds=bounds, constraints=cons) # pass cons so the constraint is actually applied

Time Series

datetime

import datetime as dt
t = pd.date_range(start='1/1/2020', periods=T, freq='D')
df = pd.DataFrame(data=r,index=t,columns=['CER'])
df.index = pd.to_datetime(df.index,format= '%Y%m')
dt.datetime.now()-dt.timedelta(days=90)

acf

print(df['Simple Return'].autocorr(lag=1))
print(df['Simple Return'].autocorr(lag=2))
print(df['Simple Return'].autocorr(lag=3))
from statsmodels.graphics.tsaplots import plot_acf
with mpl.rc_context():
    mpl.rc("figure", figsize=(10,6))
    plot_acf(df['Simple Return'], lags=5, title='First 5 lags')
    plt.savefig('9.a.3.png')
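
The matching partial-autocorrelation plot lives in the same module:

from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(df['Simple Return'], lags=5)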

Formatting

print('haah %s and %s is %.4f' % ('Gold','Silver',pvalue))
#pd.set_option('display.max_rows', 10)
#pd.set_option('display.max_columns', None)
#pd.set_option('max_colwidth',100)
#pd.set_option('display.float_format', lambda x: '%10.3f' % x)
#pd.reset_option('display.float_format')
#np.set_printoptions(precision=4, suppress=True)
%precision %10.4f # non scientific
#warnings.filterwarnings('ignore') # not recommended to turn warnings off
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
print(os.getcwd())
print(pd.__version__)
print(sys.version)
