본문 바로가기
개발일지/SQL

SQL 데이터베이스 데이터 가져오기

by 다니엘의 개발 이야기 2022. 7. 8.
320x100
import pandas as pd
import numpy as np
import pymssql
from mysql import connector

 

query = 'SELECT studentid, school, sex, age, famsize,\
        medu AS mothereducation, fedu AS fatereducation,\
        traveltime, studytime, failures, famrel, freetime,\
        goout, g1 AS gradeperiod1, g2 AS gradeperiod2,\
        g3 AS gradeperiod3 From studentmath'

server = 'pdcc.c9sqqzd5fulv.us-west-2.rds.amazonaws.com'
user = 'pdccuser'
password = 'pdccpass'
database = 'pdcctest'

conn = pymssql.connect(server = server,
                      user =user, password = password, database = database)

studentmath = pd.read_sql(query,conn)
conn.close()

 

studentmath.head()

 

studentmath.dtypes

 

host ='pdccmysql.c9sqqzd5fulv.us-west-2.rds.amazonaws.com'
user = 'pdccuser'
password = 'pdccpass'
database = 'pdccschema'
connmysql = mysql.connector.connect(host=host,
                                   database=database, user=user, password = password)

studentmath = pd.read_sql(query,connmysql)
connmysql.close()

 

newcolorder = ['studentid', 'gradeperiod1', 'gradeperiod2',
              'gradeperiod3', 'school', 'sex', 'age', 'famsize',
              'mothereducation', 'fatereducation', 'traveltime',
              'studytime', 'freetime', 'failures', 'famrel', 'goout']

studentmath = studentmath[newcolorder]

 

# 이걸로 studentid의 갯수를 세어주고
studentmath.studentid.count()

 

# 이걸로 studentid에서 센 숫자가 중복값은 아닌지 확인해준다.
studentmath.studentid.nunique()

# 두개의 값이 같다면, 중복값은 없다고 우선은 생각할 수 있다.

 

studentmath.set_index('studentid', inplace=True)

 

studentmath.count()

 

import numpy as np

setvalues = {'famrel':{1:'1:very bad', 2:'2:bad', 3:'3:neutral',
                      4:'4:good', 5:'5:excellent'},
            'freetime':{1:'1:very low', 2:'2:low', 3:'3:neutral',
                      4:'4:high', 5:'5:very high'},
            'goout':{1:'1:very low', 2:'2:low', 3:'3:neutral',
                    4:'4:high', 5:'5:very high'},
            'mothereducation':{0:np.nan, 1:'1:k-4', 2:'2:5-9',
                              3:'3:secondary ed', 4:'4:higher ed'},
            'fatereducation':{0:np.nan, 1:'1:k-4', 2:'2:5-9',
                             3:'3:secondary ed', 4:'4:higher ed'}}

studentmath.replace(setvalues, inplace=True)
setvalueskeys = [k for k in setvalues]

 

studentmath[setvalueskeys].memory_usage(index=False)

 

for col in studentmath[setvalueskeys].columns:
    studentmath[col] = studentmath[col].astype('category')
    
    
studentmath[setvalueskeys].memory_usage(index=False)

 

studentmath['famrel'].value_counts(sort=False, normalize=True)

 

studentmath[['freetime', 'goout']].\
    apply(pd.Series.value_counts, sort=False, normalize=True)

 

studentmath[['mothereducation', 'fatereducation']].\
    apply(pd.Series.value_counts, sort=False, normalize=True)
300x250