東京に棲む日々

データ分析、統計、ITを勉強中。未だ世に出ず。

Python - pandas練習1

以下のデータを読み込み、pandas でデータ加工を行う。

f:id:High_School_Student:20140615155118g:plain

 

import numpy as np
import pandas as pd
from pandas import DataFrame

 

# csvファイルの読み込み
d1 = pd.read_csv('C:パス\sample.csv')

d1

 
       pref      city  city_id           pop  pop_child  debt
0     Tokyo    Meguro       21  1.000000e+04        970     0
1     Tokyo     Ebisu       11  1.400000e+04       2000     0
2  Kanagawa  Kawasaki       19  9.005000e+04       5050     5
3       NaN  Yokohama       34  1.328192e+12      20733    10
4     Chiba     Chiba       17  3.008050e+05       3408    25
5     Chiba    Narita       42           NaN       4571    30

[6 rows x 6 columns]

 

d1.isnull()                            # 欠測の確認

 
    pref   city city_id    pop pop_child   debt
0  False  False   False  False     False  False
1  False  False   False  False     False  False
2  False  False   False  False     False  False
3   True  False   False  False     False  False
4  False  False   False  False     False  False
5  False  False   False   True     False  False

[6 rows x 6 columns]

 

d1.fillna('欠測だった')            # 欠測のfill

 
       pref      city  city_id           pop  pop_child  debt
0     Tokyo    Meguro       21         10000        970     0
1     Tokyo     Ebisu       11         14000       2000     0
2  Kanagawa  Kawasaki       19         90050       5050     5
3     欠測だった  Yokohama       34  1.328192e+12      20733    10
4     Chiba     Chiba       17        300805       3408    25
5     Chiba    Narita       42         欠測だった       4571    30

[6 rows x 6 columns]

 

d1[ d1['pref'].notnull() ]                         # pref列がNoneでない行を取り出す

 
       pref      city  city_id     pop  pop_child  debt
0     Tokyo    Meguro       21   10000        970     0
1     Tokyo     Ebisu       11   14000       2000     0
2  Kanagawa  Kawasaki       19   90050       5050     5
4     Chiba     Chiba       17  300805       3408    25
5     Chiba    Narita       42     NaN       4571    30

[5 rows x 6 columns]

 

d1[ d1['pref'].notnull() & d1['pop'].notnull() ]                       # さらにpop列もNoneでない行の取出し

 
       pref      city  city_id     pop  pop_child  debt
0     Tokyo    Meguro       21   10000        970     0
1     Tokyo     Ebisu       11   14000       2000     0
2  Kanagawa  Kawasaki       19   90050       5050     5
4     Chiba     Chiba       17  300805       3408    25

[4 rows x 6 columns]

 

# 型チェック
type(d1)                             # pandas.core.frame.DataFrame
type(d1['pref'])                    # pandas.core.series.Series
type([d1['pref']])                  # list
type(d1.ix[0,'pref'])             # str
type(d1.ix[0,'pop'])              # numpy.float64
type(d1.ix[0,'debt'])             # numpy.int64

 

d1['pop'] = d1['pop'].fillna(0).astype(np.int64)                 # numpy.int64型に変換(NaNをnumpy.int64型に変換できないので、簡易的に0で穴埋めして変換)

d1

 
       pref      city  city_id            pop  pop_child  debt
0     Tokyo    Meguro       21          10000        970     0
1     Tokyo     Ebisu       11          14000       2000     0
2  Kanagawa  Kawasaki       19          90050       5050     5
3       NaN  Yokohama       34  1328192037651      20733    10
4     Chiba     Chiba       17         300805       3408    25
5     Chiba    Narita       42              0       4571    30

[6 rows x 6 columns]

 

 

d2 = d1.copy()

d2['pref_city'] = d2['pref'].fillna('..') + '-' + d2['city']          # 欠測を補完して、str変数どうしを結合

d2['pop_child_by_pop'] = d2['pop_child'] / d2['pop']                     # 数値変数どうしの計算

d2['pop_child_by_pop_char'] = d2['pop_child'].astype(str) + '/' + d2['pop'].astype(str)                            # 数値型をstrに変換して変数結合

d2

 
       pref      city  city_id            pop  pop_child  debt  \
0     Tokyo    Meguro       21          10000        970     0
1     Tokyo     Ebisu       11          14000       2000     0
2  Kanagawa  Kawasaki       19          90050       5050     5
3       NaN  Yokohama       34  1328192037651      20733    10
4     Chiba     Chiba       17         300805       3408    25
5     Chiba    Narita       42              0       4571    30

           pref_city  pop_child_by_pop pop_child_by_pop_char
0       Tokyo-Meguro      9.700000e-02             970/10000
1        Tokyo-Ebisu      1.428571e-01            2000/14000
2  Kanagawa-Kawasaki      5.607996e-02            5050/90050
3        ..-Yokohama      1.560994e-08   20733/1328192037651
4        Chiba-Chiba      1.132960e-02           3408/300805
5       Chiba-Narita               inf                4571/0

[6 rows x 9 columns]

 

 

# 変数順の入れ替え

d3 = DataFrame(d2, columns=['pref', 'city', 'pref_city', 'city_id', 'pop', 'pop_child', 'pop_child_by_pop', 'pop_child_by_pop_char', 'debt'])

d3

 
       pref      city          pref_city  city_id            pop  pop_child  \
0     Tokyo    Meguro       Tokyo-Meguro       21          10000        970
1     Tokyo     Ebisu        Tokyo-Ebisu       11          14000       2000
2  Kanagawa  Kawasaki  Kanagawa-Kawasaki       19          90050       5050
3       NaN  Yokohama        ..-Yokohama       34  1328192037651      20733
4     Chiba     Chiba        Chiba-Chiba       17         300805       3408
5     Chiba    Narita       Chiba-Narita       42              0       4571

   pop_child_by_pop pop_child_by_pop_char  debt
0      9.700000e-02             970/10000     0
1      1.428571e-01            2000/14000     0
2      5.607996e-02            5050/90050     5
3      1.560994e-08   20733/1328192037651    10
4      1.132960e-02           3408/300805    25
5               inf                4571/0    30

[6 rows x 9 columns]

 

d3['pref'].str.contains('ok')                  # pref変数にstr'ok'が入っているとTrueを返す

 
0     True
1     True
2    False
3      NaN
4    False
5    False
Name: pref, dtype: object

 

d3[ d3['pref'].str.contains('ok') == True ]                            # 上のTrue行を返す(Noneが入っていれば、" == True"を指定しなくてはならない)

 
    pref    city     pref_city  city_id    pop  pop_child  pop_child_by_pop  \
0  Tokyo  Meguro  Tokyo-Meguro       21  10000        970          0.097000
1  Tokyo   Ebisu   Tokyo-Ebisu       11  14000       2000          0.142857

  pop_child_by_pop_char  debt
0             970/10000     0
1            2000/14000     0

[2 rows x 9 columns]

 

以上

 

以下、d1作成コード

data = DataFrame({
'pref': ['Tokyo','Tokyo','Kanagawa',None,'Chiba','Chiba'],
'city': ['Meguro','Ebisu','Kawasaki','Yokohama','Chiba','Narita'],
'city_id': [21,11,19,34,17,42],
'pop': [10000,14000,90050,1328192037651,300805,None],
'pop_child': [970,2000,5050,20733,3408,4571],
'debt': [0,0,5,10,25,30]
}, columns = ['pref','city','city_id','pop','pop_child','debt'])