
[Kaggle] Titanic Survival Prediction Challenge

by Leeys 2020. 9. 20.

Submitting to Kaggle got me a score of 0.79.

The data preprocessing still feels lacking; I've already tried just about every algorithm.

Making more use of the Name variable would probably help.

Still, I'm satisfied to have landed inside the top 1000. (I'm still very much a beginner.)

 

 

 

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

import re  # needed later for the ticket-prefix cleaning function
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('dark')
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
font_path = "c:/Windows/Fonts/malgun.ttf"
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family = font_name)

titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')
 
 
In [2]:
titanic_train.head()
Out[2]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [3]:
titanic_test.head()
Out[3]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [5]:
titanic_train.describe()
Out[5]:
  PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [99]:
titanic_train.describe(include ='O')
Out[99]:
  Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Morley, Mr. William male 1601 C23 C25 C27 S
freq 1 577 7 4 644
In [38]:
titanic_train.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
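
Before any imputation, it helps to list exactly which columns are missing values. A minimal check, consistent with the info() output above (Age is missing 177 values, Cabin 687, Embarked 2):

titanic_train.isnull().sum().sort_values(ascending=False).head()
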
In [37]:
sns.pairplot(data = titanic_test)
Out[37]:
<seaborn.axisgrid.PairGrid at 0x2858715f688>
 
 

Numerical features

In [94]:
titanic_train['Survived'].value_counts().plot(kind = 'pie',autopct='%1.1f%%')
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858b02f108>
 
In [487]:
age_means = titanic_train['Age'].mean()
titanic_train['Age'].fillna(age_means, inplace = True)

count, bin_dividers = np.histogram(titanic_train['Age'], bins = 8)
bin_names = ['10세▼','10대','20대','30대','40대','50대','60대','70대▲']

titanic_train['Age_cut'] = pd.cut(titanic_train['Age'], bins = bin_dividers, labels = bin_names, include_lowest=True)
In [488]:
titanic_train['Age_cut'].value_counts()
Out[488]:
20대     407
30대     155
10대     115
50대      86
10세▼     64
60대      42
70대▲     22
40대       0
Name: Age_cut, dtype: int64
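
The decade labels above are only approximate, since np.histogram cuts the observed Age range into eight equal-width bins whose edges don't fall on real decade boundaries. A sketch with explicit decade edges keeps each label exact; it reuses bin_names from the cell above, and the edge list is my own choice rather than the original notebook's:

# explicit decade edges so each label really covers its decade
decade_edges = [0, 10, 20, 30, 40, 50, 60, 70, 120]
titanic_train['Age_cut'] = pd.cut(titanic_train['Age'], bins = decade_edges,
                                  labels = bin_names, include_lowest = True)
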
In [87]:
fare_means = titanic_train['Fare'].mean()
titanic_train['Fare'].fillna(fare_means, inplace = True)
count, bin_dividers_Fare = np.histogram(titanic_train['Fare'], bins=6)
bin_name = ['85▼','85','170','256','341','426▲']
titanic_train['Fare_cut'] = pd.cut(titanic_train['Fare'], bins = bin_dividers_Fare, labels = bin_name, include_lowest=True)
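
Fare is heavily right-skewed (the 75th percentile is about 31 while the maximum is 512), so equal-width bins like these put nearly every passenger in the first bucket. A quantile split with pd.qcut is a common alternative; a minimal sketch, with Fare_qcut as a hypothetical column name:

# quantile bins: roughly equal-sized groups instead of equal-width ranges
titanic_train['Fare_qcut'] = pd.qcut(titanic_train['Fare'], q = 5,
                                     labels = ['q1', 'q2', 'q3', 'q4', 'q5'])
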
In [88]:
hist_col = ['Age_cut', 'SibSp','Parch','Fare_cut']
fig, axe = plt.subplots(1, 4, figsize=(20,5))  # one axis per feature in hist_col
x = 0
for i in hist_col:
    sns.barplot(x = i, y = 'Survived' ,data = titanic_train, ax = axe.flatten()[x])
    x+=1
fig.tight_layout()
 
 

Categorical features

In [120]:
titanic_train['Cabin']
Out[120]:
0         X
1       C85
2         X
3      C123
4         X
       ... 
886       X
887     B42
888       X
889    C148
890       X
Name: Cabin, Length: 891, dtype: object
In [163]:
titanic_train['Cabin'] = titanic_train['Cabin'].str[:1]
titanic_train['Embarked'].fillna('S', inplace = True)
titanic_train['Cabin'].fillna('X', inplace = True)
In [165]:
dist_col = ['Sex', 'Cabin', 'Embarked']
ff, axx = plt.subplots(1,3, figsize = (12,5))
x = 0
for i in dist_col:
    sns.countplot(x = i,hue = 'Survived', data = titanic_train, ax = axx.flatten()[x])
    x+=1
ff.tight_layout()
 
In [108]:
f, ax = plt.subplots(1,2,figsize=(12,5))
table = titanic_train.pivot_table(index = ['Sex','Survived'], columns =['Pclass'], aggfunc='size')
sns.heatmap(table, annot = True, fmt='d', cmap = 'YlGnBu', linewidth = .5, cbar = False, ax = ax[0])
sns.barplot(x= 'Sex', y='Survived', hue = 'Pclass', data = titanic_train, ax = ax[1])
Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858b969b88>
 
In [158]:
fff, axx = plt.subplots(1,4,figsize=(20,5))
sns.barplot(x='Embarked', y='Age',hue='Survived', data=titanic_train, ax=axx[0])
sns.barplot(x='Embarked', y='Fare',hue='Survived', data=titanic_train, ax=axx[1])
sns.barplot(x='Embarked', y='SibSp',hue='Survived', data=titanic_train, ax=axx[2])
sns.barplot(x='Embarked', y='Parch',hue='Survived', data=titanic_train, ax=axx[3])
Out[158]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858d8bacc8>
 
In [175]:
fff, axx = plt.subplots(1,4,figsize=(20,5))
sns.barplot(x='Pclass', y='Age',hue='Survived', data=titanic_train, ax=axx[0])
sns.barplot(x='Pclass', y='Fare',hue='Survived', data=titanic_train, ax=axx[1])
sns.barplot(x='Pclass', y='SibSp',hue='Survived', data=titanic_train, ax=axx[2])
sns.barplot(x='Pclass', y='Parch',hue='Survived', data=titanic_train, ax=axx[3])
Out[175]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858dfa3188>
 
In [178]:
sns.lmplot(x = 'Fare', y = 'Survived', data=titanic_train)
Out[178]:
<seaborn.axisgrid.FacetGrid at 0x2858d5b5e88>
 
In [224]:
def cleaning(x):
    # keep only the alphabetic prefix of the ticket string
    letters_only = re.compile('[^a-zA-Z+]')
    return letters_only.sub('', x)

titanic_train['Ticket_cut'] = titanic_train['Ticket'].apply(cleaning)
titanic_train['Ticket_cut'].replace('', 'number', inplace = True)
In [239]:
plt.subplots(1,1,figsize = (40,20))
sns.countplot(x= 'Ticket_cut', hue = 'Survived', data =titanic_train)
Out[239]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858cfd4f48>
 
In [247]:
titanic_train['Family'] = titanic_train['SibSp'] + titanic_train['Parch']
table = titanic_train.pivot_table(index = ['Survived'], columns =['Family'], aggfunc='size')

fig, ax = plt.subplots(1,2, figsize=(20,5))
sns.heatmap(table, annot = True, cmap = 'YlGnBu', linewidth = .5, cbar = False, ax = ax[0])
sns.countplot(x = 'Family', hue='Survived', data = titanic_train, ax = ax[1])
Out[247]:
<matplotlib.axes._subplots.AxesSubplot at 0x2858bc20348>
 
In [511]:
titanic_train['Name_cut'] = titanic_train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

titanic_train['Name_cut'] = titanic_train['Name_cut'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
titanic_train['Name_cut'] = titanic_train['Name_cut'].replace(['Mlle','Mme','Ms'], 'Other')
titanic_train['Name_cut'].value_counts()
Out[511]:
Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Name_cut, dtype: int64
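
As mentioned at the top, this title feature never actually reaches the model: data_clean below drops Name outright. One way to keep it would be to extract and encode the title before dropping the column; a sketch, where add_title and the integer mapping are my own hypothetical additions rather than part of the original notebook:

def add_title(df):
    # extract the honorific (the word ending with a period) from Name
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
            'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Mme', 'Ms']
    df['Title'] = df['Title'].replace(rare, 'Other')
    # encode the five remaining titles as integers
    df['Title'] = df['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Other': 4})
    return df

Calling add_title(df) at the start of data_clean, before the df.drop('Name', ...) line, would carry the title through to the model.
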
In [537]:
from sklearn.preprocessing import LabelEncoder

titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

def data_clean(df):
    
    def cleaning(x):
        # keep only the alphabetic prefix of the ticket string
        letters_only = re.compile('[^a-zA-Z+]')
        return letters_only.sub('', x)
    
    df.drop('Name', axis = 1, inplace = True)
    df['Age'].fillna(df['Age'].mean(), inplace = True)
    df['Cabin'].fillna('X', inplace = True)
    df['Embarked'].fillna('S', inplace= True)
    
    # bucket Age into 4 ordinal groups; .loc avoids the chained-assignment
    # SettingWithCopyWarning (the freshly read CSV has a default RangeIndex,
    # so .loc[i, ...] addresses row i)
    for i in range(len(df)):
        if df.loc[i, 'Age'] <= 10:
            df.loc[i, 'Age'] = 1
        elif df.loc[i, 'Age'] <= 30:
            df.loc[i, 'Age'] = 2
        elif df.loc[i, 'Age'] <= 50:
            df.loc[i, 'Age'] = 3
        else:
            df.loc[i, 'Age'] = 4
            
    # bucket Fare into 5 ordinal groups using the training-set Fare quartiles
    # seen in describe() above, plus one extra cut at 86
    for i in range(len(df)):
        if df.loc[i, 'Fare'] <= 7.910400:
            df.loc[i, 'Fare'] = 1
        elif df.loc[i, 'Fare'] <= 14.454200:
            df.loc[i, 'Fare'] = 2
        elif df.loc[i, 'Fare'] <= 31.000000:
            df.loc[i, 'Fare'] = 3
        elif df.loc[i, 'Fare'] <= 86:
            df.loc[i, 'Fare'] = 4
        else:
            df.loc[i, 'Fare'] = 5
            
    # keep only the deck letter of Cabin and the alphabetic prefix of Ticket
    df['Cabin'] = df['Cabin'].str[:1]
    df['Ticket'] = df['Ticket'].apply(cleaning)
    df['Ticket'].replace('', 'number', inplace = True)
    
    # family size = siblings/spouses + parents/children aboard
    df['Family'] = df['SibSp'] + df['Parch']
    
    return df
    
def Labelencoding(df):
    
    features = ['Cabin', 'Ticket', 'Embarked', 'Sex']
    
    for feature in features:
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])
    return df



    
titanic_train = data_clean(titanic_train)        
titanic_train = Labelencoding(titanic_train)
titanic_test = data_clean(titanic_test)
titanic_test = Labelencoding(titanic_test)
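
One caveat with Labelencoding as written: a fresh encoder is fit separately on the train and test frames, so the same Cabin letter or Ticket prefix can map to a different integer in each set, which makes the splits learned on the training codes unreliable on the test codes. A sketch of fitting each encoder on the union of both sets instead (this would replace the two Labelencoding calls above, and assumes both frames have already been through data_clean):

for feature in ['Cabin', 'Ticket', 'Embarked', 'Sex']:
    le = LabelEncoder()
    # fit on the combined categories so train and test share the same integer codes
    le.fit(pd.concat([titanic_train[feature], titanic_test[feature]]))
    titanic_train[feature] = le.transform(titanic_train[feature])
    titanic_test[feature] = le.transform(titanic_test[feature])
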
 
In [538]:
label = titanic_train['Survived']
titanic_train.drop('Survived', axis = 1 , inplace = True)
X = titanic_train
In [539]:
titanic_test.columns
Out[539]:
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'Family'],
      dtype='object')
In [540]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X,label, test_size = 0.2, random_state = 42)
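
Since only about 38% of passengers survived (per the describe() output earlier), passing stratify=label keeps that ratio the same in both halves of the split; a minor variant of the call above:

X_train, X_test, y_train, y_test = train_test_split(
    X, label, test_size = 0.2, stratify = label, random_state = 42)
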
In [541]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV,
                                     train_test_split, cross_val_predict, cross_val_score)

from xgboost import XGBRFClassifier,XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
In [542]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)  # shuffle=True so the random_state actually has an effect

model = LogisticRegression(multi_class= 'multinomial', solver = 'lbfgs')

param_grid = { 'C' : [0.1,0.5,1,1.5,5,10],
}

log_reg = GridSearchCV(model, param_grid = param_grid, scoring = 'accuracy', cv=cv, n_jobs=-1)
In [543]:
model=XGBRFClassifier()
param_grid={'booster' :['gbtree'],
                 'max_depth':[4,6,8],
                 'gamma':[0,1,2,3],
                 'n_estimators':[250, 350, 450],
                 'random_state':[42],
                'learning_rate':[0.1]}

xgbrf_clf =GridSearchCV(model, param_grid=param_grid,scoring = 'accuracy',cv = cv, n_jobs=-1)
In [544]:
model=XGBClassifier()
param_grid={'booster' :['gbtree'],
                 'max_depth':[4,6,8],
                 'gamma':[0,1,2,3],
                 'n_estimators':[250, 350, 450],
                 'random_state':[42],
                'learning_rate':[0.1]}

xgb_clf =GridSearchCV(model, param_grid=param_grid,scoring = 'accuracy',cv = cv, n_jobs=-1)
In [545]:
model=RandomForestClassifier()
param_grid={     
                 'max_depth':[4,6,8],
                 'n_estimators':[250, 350, 450],
                 'max_features': ['auto'],
                 'random_state':[42],
            }

rf_clf=GridSearchCV(model, param_grid=param_grid,scoring = 'accuracy',cv = cv, n_jobs=-1)
In [546]:
ada_clf_rf = AdaBoostClassifier(algorithm='SAMME.R',base_estimator=RandomForestClassifier(n_estimators=200, max_depth= 4, min_samples_leaf= 2, min_samples_split= 6),
                             random_state=42,n_estimators=300,
                            learning_rate = 0.1)
In [547]:
ada_clf_ds = AdaBoostClassifier(algorithm='SAMME.R',base_estimator=DecisionTreeClassifier(max_depth= 4, min_samples_leaf= 2, min_samples_split= 6),
                             random_state=42,n_estimators=300,
                            learning_rate = 0.1)
In [548]:
grd_clf = GradientBoostingClassifier(max_depth=2, n_estimators=250)
In [549]:
models=[
        ('xgbrf_clf', xgbrf_clf),
        ('xgb_clf', xgb_clf),
        ('rf_clf', rf_clf),
        ('ada_clf_rf', ada_clf_rf),
        ('ada_clf_ds',ada_clf_ds),
        ('grd_clf',grd_clf),
    
]

vot_hard = VotingClassifier(estimators=models, voting='hard', n_jobs=-1)

vot_hard.fit(X_train, y_train)
hard_pred = vot_hard.predict(titanic_test)

vot_soft = VotingClassifier(estimators=models, voting='soft', n_jobs=-1)
vot_soft.fit(X_train, y_train)
soft_pred = vot_soft.predict(titanic_test)
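
The held-out X_test/y_test from the earlier split is never actually scored, even though accuracy_score was imported. A quick sanity check before writing the submission files could look like this, using the vot_hard and vot_soft fitted above:

# rough validation accuracy on the 20% held-out split
print('hard voting :', accuracy_score(y_test, vot_hard.predict(X_test)))
print('soft voting :', accuracy_score(y_test, vot_soft.predict(X_test)))
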
In [550]:
a = pd.DataFrame({
    'PassengerId' : titanic_test['PassengerId'],
    'Survived' : hard_pred
})

a.to_csv('hard_predict1.csv', index = False)

a = pd.DataFrame({
    'PassengerId' : titanic_test['PassengerId'],
    'Survived' : soft_pred
})

a.to_csv('soft_predict1.csv', index = False)