# SAMPLE PYTHON PROGRAM FOR REGRESSION  March 2024

# pythonexample.py

# Includes OLS in statsmodels module and OLS and Random Forest in sklearn package

# You will need to change the filenames to your directory structure

# Sanity check: confirm that the Python interpreter runs this script
print("hello")

# LOAD A STATA DATA SET (or .CSV FILE) WITH pandas
# The alias pd is the short name used for pandas in the commands that follow
import pandas as pd
# Write directory separators as / rather than \
# Absolute paths are recommended.
# To work relative to a working directory instead, use the os module
# (see the random forest example below).
# The resulting data frame is named df
df = pd.read_stata(
    filepath_or_buffer="c:/Users/ccameron/Dropbox/Desktop/Teaching/240f/python/carsdata.dta"
)

"""
Three " in a row at the beginning and at the end, followed by a blank line, comment out the enclosed text
To read in a .csv file with comma separator and names in the first row
df = pd.read_csv("c:/Users/ccameron/Dropbox/Desktop/Teaching/240f/python/carsdata.csv")
"""

# Print hello again (only reached if the statements above ran without error)
print("hello")

# SUMMARY STATISTICS IN pandas
# Following command ensures all columns are displayed
pd.set_option('display.max_columns', None)
# Each result is wrapped in print(): a bare expression such as df.head()
# is echoed only in an interactive session, and its value is silently
# discarded when this file is run as a script.
# List first five observations
print(df.head())
# Summary statistics
print(df.describe())
# Mean and median of two selected variables
print(df[["cars","hhsize"]].apply(["mean","median"]))

"""
# SOME PLOTS USING matplotlib.pyplot
# This is commented out here because, although it works in stand-alone python,
# the matplotlib package does not work if python is used within Stata
import numpy as np
import matplotlib.pyplot as plt
plt.hist(df.cars)
plt.show()
df.plot.box()
# In following x is given first, then y
plt.scatter(df.hhsize,df.cars)
"""

# LINEAR REGRESSION with the ols command of module statsmodels and a .csv dataset
# The data must be held in a pandas dataframe
import pandas as pd
from statsmodels.formula.api import ols
# Adjacent string literals inside parentheses are joined into one string,
# so a long path can be split across lines.
# The break is placed right after Teaching/ and no space is added before 240f
cars = pd.read_csv(
    "c:/Users/ccameron/Dropbox/Desktop/Teaching/"
    "240f/python/carsdata.csv"
)
# Regress CARS on HHSIZE and report the default standard errors
simple = ols("CARS ~ HHSIZE", data=cars).fit()
print(simple.summary())
# Same regression, now reporting HC3 heteroskedasticity-robust standard errors
simple = ols("CARS ~ HHSIZE", data=cars).fit(cov_type="HC3")
print(simple.summary())

# OLS using module sklearn command LinearRegression and a .csv dataset
# This requires data in a NumPy array rather than a pandas dataframe
# import modules
import os
from sklearn import linear_model
import pandas as pd
# Results in this section are wrapped in print(): a bare expression is echoed
# only in an interactive session and is discarded when run as a script.
# Change working directory to directory with file carsdata.csv
# (the directory is printed before and after the change to confirm it)
print(os.getcwd())
os.chdir("c:/Users/ccameron/Dropbox/Desktop/Teaching/240f/python/")
print(os.getcwd())
# Read in data using pandas and create y and X
# Form pandas dataframe data from a CSV file
data = pd.read_csv("carsdata.csv")
# Following command ensures all columns are displayed
pd.set_option('display.max_columns', None)
print(data.describe())
# Separate features and target variable
# values forms the NumPy array underlying the pandas dataframe named data
# X holds every column except CARS; y holds the CARS column
X = data.drop('CARS', axis=1).values
y = data['CARS'].values
# OLS
reg = linear_model.LinearRegression()
# Fit the model to the data
reg.fit(X, y)
# Estimated slope coefficients (one per column of X), then the intercept
print(reg.coef_)
print(reg.intercept_)

# RANDOM FOREST using defaults and a .csv dataset
# This requires data in a NumPy array rather than a pandas dataframe
# import modules
import os
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
# Results in this section are wrapped in print(): a bare expression is echoed
# only in an interactive session and is discarded when run as a script.
# Change working directory to directory with file carsdata.csv
# (the directory is printed before and after the change to confirm it)
print(os.getcwd())
os.chdir("c:/Users/ccameron/Dropbox/Desktop/Teaching/240f/python/")
print(os.getcwd())
# Read in data using pandas and create y and X
# Form pandas dataframe data from a CSV file
data = pd.read_csv("carsdata.csv")
# Following command ensures all columns are displayed
pd.set_option('display.max_columns', None)
print(data.describe())
# Separate features and target variable
# values forms the NumPy array underlying the pandas dataframe named data
# X holds every column except CARS; y holds the CARS column
X = data.drop('CARS', axis=1).values
y = data['CARS'].values
# Run random forest using defaults
# No random_state is set, so fitted values can differ from run to run
rf = RandomForestRegressor()
# Fit the model to the data
rf.fit(X, y)
# R-squared computed on the same data that was used to fit the model
Rsquared = rf.score(X,y)
print(Rsquared)
# Predictions for the same observations used in fitting
predictions = rf.predict(X)
print(predictions)
# Convert array to dataframe and write to file
dfpred = pd.DataFrame(predictions,columns=['rfrprediction'])
print(dfpred.describe())
# to_csv and to_stata return None when given a file name, so their result
# is not assigned; assigning it to data would replace the dataframe with None
dfpred.to_csv("predictions.csv")
dfpred.to_stata("predictions.dta")