# Python (version =3.6) for R Users Part 4: pandas
# CMU MSP 36601, Fall 2018, Howard Seltman

import numpy as np
import pandas as pd
import random
import math
import matplotlib.pyplot as plt

prices = pd.Series([12.34, 5.10, 18.60, 2.50],
                   index=['A54', 'C17', 'B23', 'M17'])

# d. Create a DataFrame using the class constructor
names = ["Pooya", "Ralph", "Jihae", "Ling"]
ages = [28, 31, 24, 22]
MSP = [True, False, False, True]

pd.DataFrame([names, ages, MSP])

dtf = pd.DataFrame(list(zip(names, ages, MSP)),
                   columns=["name", "age", "MSP"])
dtf
pd.DataFrame({'name': names, 'age': ages, 'MSP': MSP})
pd.DataFrame((names, ages, MSP))  # fails
type(dtf)

# e. Save as csv
fileLoc = r"data\fakeData.csv"  # raw string
dtf.to_csv(fileLoc, index=False, header=True)

# f. Read from csv
dtf2 = pd.read_csv(fileLoc)

# g. Check Series and DataFrame info
type(prices)
len(prices)
prices.index
prices.shape
prices.get_values()
prices.values
prices.dtype

type(dtf)
dtf.dtypes

dtf.get_values()

dtf.axes
dtf.ndim
dtf.size
dtf.shape
dtf.head(2)
dtf.tail(2)
dtf.index  # R's rownames()
dtf.index = [chr(ord('A')+i) for i in range(len(dtf))]
dtf

# h. Subsetting
prices[3]
prices['M17']
prices['C17':]
prices[:'C17']  # includes C17!!!
prices[1:3]  # does not include prices[3]
prices[prices>12]

dtf['age']
type(dtf['age'])
dtf['age']['C']  # or dtf['age'][2]
dtf[1]  # fails!!!
dtf[['MSP', 'name']]

dtf[1:3]  # slices rows; excludes row 3
dtf['B':'C']  # includes row 'C'

dtf[[True, False, True, False]]
dtf['age'].max()
dtf['age'] < dtf['age'].max()
dtf[dtf['age'] < dtf['age'].max()]

# note: parentheses required and must use "&" instead of "and"
dtf[(dtf['age'] > 22) & (dtf['age'] < 30)]
dtf.age[dtf['age'] <= 28]
dtf[(dtf['age'] <= 28) & dtf.MSP][['age', 'MSP']]
# Overloaded use of "~" complement operator in pandas:
dtf[['age', 'MSP']][~((dtf['age'] <= 28) & dtf.MSP)]

dtf.name[0:2]  # not robust!
dtf.loc['B':'C']  # based on index label and inclusive!
dtf.loc['B':'C', 'age':'MSP']
dtf.loc['B':'C', ['age', 'MSP']]
dtf.loc[['B', 'D'], 'age':'MSP']
dtf.loc[:, 'age':'MSP']
dtf.loc[[True, False, False, True]]
dtf.loc[dtf.age > 28]
dtf[dtf.age > 28][['name', 'MSP']]  # two steps
dtf.loc[dtf.age > 28, ['name', 'MSP']]  # one step

dtf.iloc[[0, 3]]
dtf.iloc[dtf.age > 28]  # fails
dtf.iloc[[0, 3], 0:2]  # exclusive like [0, 2)
dtf.iloc[dtf['age'] < 28, 0:2]  # ValueError
dtf.iloc[(dtf['age'] < 28).values, 0:2]
dtf.iloc[[1,3], ['MSP', 'age']]   # fails
dtf.columns.isin(['MSP', 'age'])  # [False,  True,  True])
dtf.iloc[[1,3], dtf.columns.isin(['MSP', 'age'])]


# Deprecated method: .ix[]
dtf.ix[[0, 3], ['MSP', 'age']]

# Other useful methods
dtf.drop("MSP", axis=1)  # axis=0 drops columns by index id
dtf.filter(regex="^[a-z]")  # add axis=0 to filter rownames

temp = pd.DataFrame([[np.nan, 2, 3], [4, 5, 6]])
temp
temp.dropna(axis=0)  # return rows with no nan's
temp.dropna(axis=1)
temp = temp.fillna(999)  # perhaps useful before an export
temp

# i. Logic testing
dtf.isnull()
dtf > 28
dtf.age > 28
(dtf.age == 31) | dtf.MSP  # parentheses required; "or" fails

# j. Descriptive statistics
prices.sum()  # ignores nan's
prices.count()  # ignores nan's
dtf.count()
dtf.count(axis=1)  # 0=rows (for each column) vs. 1 columns
dtf.mean()  # runs DataFrame.mean(axis=0)

dtf['age'].mean(skipna=False)  # runs Series.mean()

#  k. Plotting of Series or DataFrame objects
dtf['age'].plot.hist()
plt.show()  # In a new window
dtf['age'].plot.hist(bins=3)
plt.show()
dtf.age.plot.pie()
plt.show()
dtf.age.plot.box(title='Ages')
plt.show()
dtf.age.plot.kde()
plt.show()
dtf.boxplot('age', by='MSP')
plt.show()
dtf['weight'] = [130, 135, 125, 145]
dtf.plot.scatter('age', 'weight')
plt.ylabel('score')
plt.xlabel('time')
plt.title("Powers")
plt.savefig("power.png")
plt.show()


# l. Restructuring
wide = pd.DataFrame([["A", 3, 4, 5],
                     ["B", 6, 7, 8],
                     ["C", 9, 10, 11]],
                    columns=["id", "v1", "v2", "v3"])
wide
tall = pd.melt(wide, 'id')
tall
tall.pivot('id', 'variable', 'value')

# m. Adding new columns and rows
dtf['age2'] = dtf['age'] * 2
dtf['ratio'] = dtf.age2 / dtf.age
dtf
dtf.insert(2, 'score', [12, 15, 22, 11])
dtf

dtf.rename(columns={'age': 'Age'}, inplace=True)
dtf.rename(columns={'Age': 'age'}, inplace=True)

dtf['rx'] = [1, 2, 3, 1]
codes = {1: 'Placebo', 2: 'Drug A', 3: 'Drug B'}
dtf['rxc'] = dtf['rx'].map(codes.get)
dtf

# equivalents to R's cbind() and rbind()
D1 = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                  columns=['c1', 'c2', 'c3'])
D2 = pd.DataFrame([[11, 12, 13], [14, 15, 16], [17, 18, 19]],
                  columns=['c1', 'c2', 'c3'])
pd.concat([D1, D2])

pd.concat([D1, D2], axis=1)

# n. The groupby() method
N = 10
dat = pd.DataFrame({'id': ["S" + str(ii) for ii in range(N)],
                    'age': [22 + random.choice(range(3)) for
                            ii in range(N)],
                    'male': [random.random() < 0.5 for ii in
                             range(N)],
                    'score': [round(random.random(), 2) for
                              ii in range(N)]})

dat
gm = dat.groupby('male')
gm
len(gm)
gm.groups
for (male, grp) in gm:
        print('-- male --' if male else '-- female --')
        print(grp)
        print("mean score is", grp.score.mean())
        print()

gm.get_group(False)

gma = dat.groupby(['male', 'age'])
gma.groups
gma.get_group((False, 24))

gm.mean()
gm['age'].mean()
gm.max()
gma.mean()

gma.agg([np.sum, np.mean, np.std])

gm.transform(lambda x: (x-x.mean())/x.std())
pd.concat([dat, gm.transform(lambda x: (x-x.mean())/x.std())], axis=1)

# Note the following confusing issue:
np.std(dat.age)  # numpy default is ddof=0 (pop. sd)
np.std(dat.age, ddof=1)  # sample sd
dat.age.std()  # pandas changes the ddof default to 1


def z(obj):
    if obj.dtype in (np.float64, np.int64):
        return (obj - np.mean(obj)) / np.std(obj)
    return(obj)


print(dtf.apply(z))

# o. map() for Series and apply() for DataFrame
dtf['age'].map(math.log)
dtf['age'].map(lambda x: x**2)

dtf['name'].map(str.lower)
dtf[['age', 'score', 'age2', 'ratio']].apply(np.median)

# p. Convert integer columns to float so that DataFrames will play nice
#     with some other modules
dtf.dtypes
list(dtf.dtypes == 'int64')
dtf.loc[:, dtf.dtypes == 'int64'] = \
        dtf.loc[:, dtf.dtypes == 'int64'].astype('float64')
dtf