320x100
import pandas as pd
import numpy as np
import pymssql
from mysql import connector
query = 'SELECT studentid, school, sex, age, famsize,\
medu AS mothereducation, fedu AS fatereducation,\
traveltime, studytime, failures, famrel, freetime,\
goout, g1 AS gradeperiod1, g2 AS gradeperiod2,\
g3 AS gradeperiod3 From studentmath'
server = 'pdcc.c9sqqzd5fulv.us-west-2.rds.amazonaws.com'
user = 'pdccuser'
password = 'pdccpass'
database = 'pdcctest'
conn = pymssql.connect(server = server,
user =user, password = password, database = database)
studentmath = pd.read_sql(query,conn)
conn.close()
studentmath.head()
studentmath.dtypes
host ='pdccmysql.c9sqqzd5fulv.us-west-2.rds.amazonaws.com'
user = 'pdccuser'
password = 'pdccpass'
database = 'pdccschema'
connmysql = mysql.connector.connect(host=host,
database=database, user=user, password = password)
studentmath = pd.read_sql(query,connmysql)
connmysql.close()
newcolorder = ['studentid', 'gradeperiod1', 'gradeperiod2',
'gradeperiod3', 'school', 'sex', 'age', 'famsize',
'mothereducation', 'fatereducation', 'traveltime',
'studytime', 'freetime', 'failures', 'famrel', 'goout']
studentmath = studentmath[newcolorder]
# 이걸로 studentid의 갯수를 세어주고
studentmath.studentid.count()
# 이걸로 studentid에서 센 숫자가 중복값은 아닌지 확인해준다.
studentmath.studentid.nunique()
# 두개의 값이 같다면, 중복값은 없다고 우선은 생각할 수 있다.
studentmath.set_index('studentid', inplace=True)
studentmath.count()
import numpy as np
setvalues = {'famrel':{1:'1:very bad', 2:'2:bad', 3:'3:neutral',
4:'4:good', 5:'5:excellent'},
'freetime':{1:'1:very low', 2:'2:low', 3:'3:neutral',
4:'4:high', 5:'5:very high'},
'goout':{1:'1:very low', 2:'2:low', 3:'3:neutral',
4:'4:high', 5:'5:very high'},
'mothereducation':{0:np.nan, 1:'1:k-4', 2:'2:5-9',
3:'3:secondary ed', 4:'4:higher ed'},
'fatereducation':{0:np.nan, 1:'1:k-4', 2:'2:5-9',
3:'3:secondary ed', 4:'4:higher ed'}}
studentmath.replace(setvalues, inplace=True)
setvalueskeys = [k for k in setvalues]
studentmath[setvalueskeys].memory_usage(index=False)
for col in studentmath[setvalueskeys].columns:
studentmath[col] = studentmath[col].astype('category')
studentmath[setvalueskeys].memory_usage(index=False)
studentmath['famrel'].value_counts(sort=False, normalize=True)
studentmath[['freetime', 'goout']].\
apply(pd.Series.value_counts, sort=False, normalize=True)
studentmath[['mothereducation', 'fatereducation']].\
apply(pd.Series.value_counts, sort=False, normalize=True)
300x250
'개발일지 > 임시카테고리' 카테고리의 다른 글
python 정규표현식 re로 문자열 안의 값을 꺼내기, 변경해주기 (0) | 2022.07.08 |
---|---|
Python 기본용어 원리 (strip, split, join, upper, lower, capitalize) (0) | 2022.07.08 |
pymssql 설치에러 해결! (0) | 2022.07.08 |
pandas excel 가져오기 루틴 (0) | 2022.07.08 |
pandas로 데이터 가져오기 루틴 1 (csv파일 읽기) (0) | 2022.07.08 |