반응형
캐글에 제출했더니 0.79가 나왔다.
데이터 전처리가 부족한 것 같다. 알고리즘은 다 써 봤는데;;
Name 변수를 좀 더 이용하면 좋을 것 같다.
그래도 1000등 안에 들어서 만족스럽긴 하다. (아직 입문 단계 ㅠㅠ)
In [1]:
# Notebook setup: widen the Jupyter page, load the plotting libraries,
# register a Korean-capable font for matplotlib, and read the Titanic CSVs.
from pathlib import Path

# `display` is provided by IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('dark')
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# The font path is Windows-specific; only override matplotlib's font when the
# file is actually present so the notebook still runs on other systems.
font_path = "c:/Windows/Fonts/malgun.ttf"
if Path(font_path).is_file():
    font_name = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family = font_name)

titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
In [2]:
# Preview the first rows of the training set.
titanic_train.head()
Out[2]:
In [3]:
# Preview the first rows of the test set.
titanic_test.head()
Out[3]:
In [5]:
# Summary statistics of the training set (default describe() output).
titanic_train.describe()
Out[5]:
In [99]:
# Summary statistics restricted to object-dtype columns (include='O').
titanic_train.describe(include ='O')
Out[99]:
In [38]:
# Print the frame summary (columns, dtypes, non-null counts).
titanic_train.info()
In [37]:
# Pairwise relationship grid of the columns.
# NOTE(review): this plots the *test* set, not the training set — confirm intent.
sns.pairplot(data = titanic_test)
Out[37]:
수치형 데이터
In [94]:
# Pie chart of the Survived class counts, annotated with percentages.
titanic_train['Survived'].value_counts().plot(kind = 'pie',autopct='%1.1f%%')
Out[94]:
In [487]:
# Fill missing ages with the mean age, then cut Age into 8 equal-width bins
# (edges taken from np.histogram) and label each bin for the survival plots.
age_means = titanic_train['Age'].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
titanic_train['Age'] = titanic_train['Age'].fillna(age_means)
count, bin_dividers = np.histogram(titanic_train['Age'], bins = 8)
bin_names = ['10세▼','10대','20대','30대','40대','50대','60대','70대▲']
titanic_train['Age_cut'] = pd.cut(
    titanic_train['Age'],
    bins = bin_dividers,
    labels = bin_names,
    include_lowest=True,  # keep the minimum age inside the first bin
)
In [488]:
# Number of passengers in each age bin.
titanic_train['Age_cut'].value_counts()
Out[488]:
In [87]:
# Fill missing fares with the mean fare, then cut Fare into 6 equal-width bins
# (edges taken from np.histogram) and label each bin for the survival plots.
fare_means = titanic_train['Fare'].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
titanic_train['Fare'] = titanic_train['Fare'].fillna(fare_means)
count, bin_dividers_Fare = np.histogram(titanic_train['Fare'], bins=6)
bin_name = ['85▼','85','170','256','341','426▲']
titanic_train['Fare_cut'] = pd.cut(
    titanic_train['Fare'],
    bins = bin_dividers_Fare,
    labels = bin_name,
    include_lowest=True,  # keep the minimum fare inside the first bin
)
In [88]:
# Mean survival rate per category for each binned / count feature.
hist_col = ['Age_cut', 'SibSp','Parch','Fare_cut']
# One panel per plotted column; the previous fixed count of 5 left an empty axis.
fig, axe = plt.subplots(1, len(hist_col), figsize=(20,5))
for column, panel in zip(hist_col, axe.flatten()):
    sns.barplot(x = column, y = 'Survived', data = titanic_train, ax = panel)
fig.tight_layout()
범주형 데이터
In [120]:
# Inspect the raw Cabin column.
titanic_train['Cabin']
Out[120]:
In [163]:
# Reduce Cabin to its first character and mark missing cabins with 'X';
# fill missing Embarked values with 'S'.
# Results are assigned back to the frame: fillna(inplace=True) on a column
# selection is chained assignment and is not written back under pandas
# Copy-on-Write.
titanic_train['Cabin'] = titanic_train['Cabin'].str[:1].fillna('X')
titanic_train['Embarked'] = titanic_train['Embarked'].fillna('S')
In [165]:
# Passenger counts per category, split by survival, for each categorical column.
dist_col = ['Sex', 'Cabin', 'Embarked']
ff, axx = plt.subplots(1, 3, figsize=(12, 5))
for column, panel in zip(dist_col, axx.flatten()):
    sns.countplot(x=column, hue='Survived', data=titanic_train, ax=panel)
ff.tight_layout()
In [108]:
# Left: passenger counts by (Sex, Survived) x Pclass as an annotated heatmap.
# Right: mean survival by Sex, split by Pclass.
f, ax = plt.subplots(1, 2, figsize=(12, 5))
table = titanic_train.pivot_table(
    index=['Sex', 'Survived'],
    columns=['Pclass'],
    aggfunc='size',
)
heatmap_options = dict(annot=True, fmt='d', cmap='YlGnBu', linewidth=.5, cbar=False)
sns.heatmap(table, ax=ax[0], **heatmap_options)
sns.barplot(x='Sex', y='Survived', hue='Pclass', data=titanic_train, ax=ax[1])
Out[108]:
In [158]:
# Mean Age / Fare / SibSp / Parch per port of embarkation, split by survival.
fff, axx = plt.subplots(1, 4, figsize=(20, 5))
by_embarked = dict(x='Embarked', hue='Survived', data=titanic_train)
sns.barplot(y='Age', ax=axx[0], **by_embarked)
sns.barplot(y='Fare', ax=axx[1], **by_embarked)
sns.barplot(y='SibSp', ax=axx[2], **by_embarked)
sns.barplot(y='Parch', ax=axx[3], **by_embarked)
Out[158]:
In [175]:
# Mean Age / Fare / SibSp / Parch per passenger class, split by survival.
fff, axx = plt.subplots(1, 4, figsize=(20, 5))
by_pclass = dict(x='Pclass', hue='Survived', data=titanic_train)
sns.barplot(y='Age', ax=axx[0], **by_pclass)
sns.barplot(y='Fare', ax=axx[1], **by_pclass)
sns.barplot(y='SibSp', ax=axx[2], **by_pclass)
sns.barplot(y='Parch', ax=axx[3], **by_pclass)
Out[175]:
In [178]:
# Regression plot of Survived against Fare.
# x / y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.lmplot(x='Fare', y='Survived', data=titanic_train)
Out[178]:
In [224]:
import re

# Matches every character that is not an ASCII letter. The range is A-Z
# (not A-z, which would also cover the punctuation [ \ ] ^ _ `), and the
# pattern is compiled once instead of on every call.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def cleaning(x):
    """Return only the ASCII letters contained in ticket string *x*.

    Digits, whitespace and punctuation are removed, so a purely numeric
    ticket yields an empty string.

    Args:
        x: Raw ticket string.

    Returns:
        The letters of *x* in their original order and case.
    """
    return _NON_LETTERS.sub('', x)
# Keep only the letter prefix of each ticket; tickets with no letters at all
# are labelled 'number'. The result is assigned in one step because an
# in-place replace on a column selection is chained assignment and is not
# written back under pandas Copy-on-Write.
titanic_train['Ticket_cut'] = titanic_train['Ticket'].apply(cleaning).replace('', 'number')
In [239]:
# Passenger counts per ticket prefix, split by survival, on one large canvas.
ticket_fig, ticket_ax = plt.subplots(1, 1, figsize=(40, 20))
sns.countplot(x='Ticket_cut', hue='Survived', data=titanic_train, ax=ticket_ax)
Out[239]:
In [247]:
# Family = siblings/spouses + parents/children travelling with the passenger.
titanic_train['Family'] = titanic_train['SibSp'] + titanic_train['Parch']
# Passenger counts for every (Survived, Family) combination.
table = titanic_train.pivot_table(index = ['Survived'], columns =['Family'], aggfunc='size')
fig, ax = plt.subplots(1,2, figsize=(20,5))
sns.heatmap(table, annot = True, cmap = 'YlGnBu', linewidth = .5, cbar = False, ax = ax[0])
# The column is passed as x=...: recent seaborn releases no longer accept it
# as a positional argument.
sns.countplot(x = 'Family', hue='Survived', data = titanic_train, ax = ax[1])
Out[247]:
In [511]:
# Extract the honorific (the word directly followed by a period) from Name.
# The pattern is a raw string so that `\.` is not an invalid string escape.
titanic_train['Name_cut'] = titanic_train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse rare titles into a single 'Other' bucket.
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
    'Jonkheer', 'Dona', 'Mlle', 'Mme', 'Ms',
]
titanic_train['Name_cut'] = titanic_train['Name_cut'].replace(rare_titles, 'Other')
titanic_train['Name_cut'].value_counts()
Out[511]:
In [537]:
from sklearn.preprocessing import LabelEncoder
# Re-read both CSV files so the modelling pipeline starts from the raw data
# rather than from the frames modified during exploration above.
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
def data_clean(df):
    """Prepare a raw Titanic frame for modelling, modifying *df* in place.

    Steps:
      * drop ``Name``;
      * fill missing ``Age`` with the column mean and missing ``Fare`` with
        the column median (a missing fare previously fell into the top bin);
      * fill missing ``Cabin`` with ``'X'`` and keep only its first character;
      * fill missing ``Embarked`` with ``'S'``;
      * bin ``Age`` into codes 1-4 (<=10, <=30, <=50, above) and ``Fare``
        into codes 1-5 (<=7.9104, <=14.4542, <=31, <=86, above);
      * reduce ``Ticket`` to its ASCII letters, using ``'number'`` when the
        ticket has none;
      * add ``Family`` = ``SibSp`` + ``Parch``.

    Every column is assigned back to the frame: element-wise chained
    assignment (``df[col][i] = ...``) and ``fillna(inplace=True)`` on a
    column selection are not written back when pandas Copy-on-Write is
    active, and the vectorised form does not depend on the index labels.

    Args:
        df: DataFrame with Name, Age, Fare, Cabin, Embarked, Ticket, SibSp
            and Parch columns.

    Returns:
        The same DataFrame object, so the call can be used in an assignment.
    """
    df.drop('Name', axis=1, inplace=True)

    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('X').str[:1]
    df['Embarked'] = df['Embarked'].fillna('S')

    # Right-closed intervals reproduce the original `<=` thresholds; the codes
    # are cast to float so the columns keep a numeric dtype.
    age_edges = [-np.inf, 10, 30, 50, np.inf]
    df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=[1, 2, 3, 4]).astype(float)

    fare_edges = [-np.inf, 7.910400, 14.454200, 31.000000, 86, np.inf]
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=[1, 2, 3, 4, 5]).astype(float)

    # A-Z (not A-z): the wider range would also keep the characters [ \ ] ^ _ `.
    ticket_letters = df['Ticket'].astype(str).str.replace(r'[^a-zA-Z]', '', regex=True)
    df['Ticket'] = ticket_letters.replace('', 'number')

    df['Family'] = df['SibSp'] + df['Parch']
    return df
def Labelencoding(df):
    """Replace the categorical columns of *df* with integer label codes.

    A fresh ``LabelEncoder`` is fitted on each of Cabin, Ticket, Embarked
    and Sex using the values present in *df*, and the column is overwritten
    with the encoded values.

    Args:
        df: DataFrame containing the four columns above.

    Returns:
        The same DataFrame, modified in place.
    """
    for column in ('Cabin', 'Ticket', 'Embarked', 'Sex'):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
# Clean both frames, then label-encode them TOGETHER. Encoding train and test
# with separately fitted encoders can map the same category to different
# integers whenever the two files do not contain the same set of values.
titanic_train = data_clean(titanic_train)
titanic_test = data_clean(titanic_test)

n_train_rows = len(titanic_train)
combined = pd.concat([titanic_train, titanic_test], ignore_index=True, sort=False)
combined = Labelencoding(combined)

titanic_train = combined.iloc[:n_train_rows].copy()
# The concat filled Survived with NaN for the test rows, turning the column
# into floats; restore the integer labels for the training rows.
titanic_train['Survived'] = titanic_train['Survived'].astype(int)
titanic_test = (
    combined.iloc[n_train_rows:]
    .drop(columns='Survived')
    .reset_index(drop=True)
)
In [538]:
# Separate the target from the features: pop() returns the Survived column
# and removes it from the frame in place, leaving only feature columns in X.
label = titanic_train.pop('Survived')
X = titanic_train
In [539]:
# List the columns of the prepared test frame.
titanic_test.columns
Out[539]:
In [540]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Hold out 20% of the labelled data (fixed seed for reproducibility).
# NOTE(review): X_test / y_test are not referenced again in the visible code —
# the hold-out set is never scored; confirm whether an evaluation step is missing.
X_train, X_test, y_train, y_test = train_test_split(X,label, test_size = 0.2, random_state = 42)
In [541]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, StratifiedKFold, GridSearchCV,RandomizedSearchCV
from xgboost import XGBRFClassifier,XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
In [542]:
# Logistic-regression baseline, grid-searched over the regularisation strength C.
# `cv` is defined here because this cell runs before the later cell that
# creates it; run top to bottom the original raised NameError.
# NOTE(review): log_reg is not part of the `models` list used for voting.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression(multi_class= 'multinomial', solver = 'lbfgs')
param_grid = {'C': [0.1, 0.5, 1, 1.5, 5, 10]}
log_reg = GridSearchCV(model, param_grid = param_grid, scoring = 'accuracy', cv=cv, n_jobs=-1)
In [543]:
# 10-fold cross-validation shared by the grid searches below.
# shuffle=True is required for random_state to take effect; scikit-learn
# raises ValueError when random_state is set on an unshuffled KFold.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# XGBoost random-forest classifier, grid-searched on accuracy.
model = XGBRFClassifier()
param_grid = {
    'booster': ['gbtree'],
    'max_depth': [4, 6, 8],
    'gamma': [0, 1, 2, 3],
    'n_estimators': [250, 350, 450],
    'random_state': [42],
    'learning_rate': [0.1],
}
xgbrf_clf = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
In [544]:
# Gradient-boosted XGBoost classifier, grid-searched on accuracy with the
# shared `cv` splitter.
model = XGBClassifier()
param_grid = dict(
    booster=['gbtree'],
    max_depth=[4, 6, 8],
    gamma=[0, 1, 2, 3],
    n_estimators=[250, 350, 450],
    random_state=[42],
    learning_rate=[0.1],
)
xgb_clf = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
In [545]:
# Random forest, grid-searched on accuracy with the shared `cv` splitter.
model = RandomForestClassifier()
param_grid = {
    'max_depth': [4, 6, 8],
    'n_estimators': [250, 350, 450],
    # 'auto' was removed from scikit-learn; for classifiers it meant 'sqrt'.
    'max_features': ['sqrt'],
    'random_state': [42],
}
rf_clf = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
In [546]:
# AdaBoost over shallow random forests.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases.
# algorithm='SAMME.R' is no longer passed explicitly: the option was removed
# from newer releases and was the default in the releases that offered it.
ada_clf_rf = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=200, max_depth= 4, min_samples_leaf= 2, min_samples_split= 6),
    random_state=42,
    n_estimators=300,
    learning_rate = 0.1,
)
In [547]:
# AdaBoost over depth-4 decision trees.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases.
# algorithm='SAMME.R' is no longer passed explicitly: the option was removed
# from newer releases and was the default in the releases that offered it.
ada_clf_ds = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth= 4, min_samples_leaf= 2, min_samples_split= 6),
    random_state=42,
    n_estimators=300,
    learning_rate = 0.1,
)
In [548]:
# Gradient boosting with 250 depth-2 trees (no hyper-parameter search).
grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=250)
In [549]:
# Named estimators combined by the voting ensembles, in voting order.
models = list({
    'xgbrf_clf': xgbrf_clf,
    'xgb_clf': xgb_clf,
    'rf_clf': rf_clf,
    'ada_clf_rf': ada_clf_rf,
    'ada_clf_ds': ada_clf_ds,
    'grd_clf': grd_clf,
}.items())

# Hard voting: fit on the training split, then predict the Kaggle test set.
vot_hard = VotingClassifier(estimators=models, voting='hard', n_jobs=-1)
vot_hard = vot_hard.fit(X_train, y_train)
hard_pred = vot_hard.predict(titanic_test)

# Soft voting: same estimators and data, with voting='soft'.
vot_soft = VotingClassifier(estimators=models, voting='soft', n_jobs=-1)
vot_soft = vot_soft.fit(X_train, y_train)
soft_pred = vot_soft.predict(titanic_test)
In [550]:
# Write one Kaggle submission file per voting strategy
# (columns: PassengerId, Survived; no index column).
submissions = (
    ('hard_predict1.csv', hard_pred),
    ('soft_predict1.csv', soft_pred),
)
for csv_name, predictions in submissions:
    a = pd.DataFrame({
        'PassengerId': titanic_test['PassengerId'],
        'Survived': predictions,
    })
    a.to_csv(csv_name, index=False)
반응형
'ML | DL > ML DL 프로젝트' 카테고리의 다른 글
강남대학교(KNU) 에브리타임 데이터 분석 (4) | 2022.03.18 |
---|
댓글