Data Analysis with Python: Basic Examples
 
2019-3-28
 
Editor's note:
Source: CSDN. The article works through several cases built on official U.S. datasets to explain data analysis with Python in detail; the full content follows below.


import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1. USA.gov Data from Bitly

This is anonymized data that the official USA.gov website collected from its users.

path='datasets/bitly_usagov/example.txt'
data=[json.loads(line) for line in open(path)]
df=pd.DataFrame(data)

df.info()

tz×ֶΰüº¬µÄÊÇÊ±ÇøÐÅÏ¢¡£

df.loc[:,'tz'].value_counts()[:10]

Judging from the output of info() and value_counts(), the tz column contains both missing values and empty strings. First fill the missing values, then handle the empty strings:

clean_tz=df.loc[:,'tz'].fillna('missing')
clean_tz.loc[clean_tz=='']='unknown'
clean_tz.value_counts()[:5]

plt.clf()
subset=clean_tz.value_counts()[:10]
subset.plot.barh()
plt.show()

a×ֶΰüº¬µÄÊÇä¯ÀÀÆ÷¡¢É豸ÓëÓ¦ÓõÈÐÅÏ¢¡£

df.loc[:,'a'].sample(10)

Suppose we want to count Windows versus non-Windows records; we look for the string 'Windows' in the a field. Since the a field also contains missing values, here we simply drop them:

clean_df=df[df.loc[:,'a'].notnull()].copy() # copy to avoid assigning into a view
mask=clean_df.loc[:,'tz']==''
clean_df.loc[mask,'tz']='unknown'
mask=clean_df.loc[:,'a'].str.contains('Windows')
clean_df.loc[:,'os']=np.where(mask,'Windows','not Windows')
clean_df.drop('a',axis=1,inplace=True)

by_tz_os=clean_df.groupby(['tz','os'])
tz_os_counts=by_tz_os.size().unstack().fillna(0)
indexer=tz_os_counts.sum(axis=1).argsort() # indices that would sort by total count
tz_os_counts_subset=tz_os_counts.take(indexer[-10:]) # take the ten largest entries
tz_os_counts_subset

plt.clf()
tz_os_counts_subset.plot.barh()
plt.show()

Because the counts differ hugely across regions, we still need to normalize the data if we want to see the OS differences more clearly:

tz_os_counts_subset_norm=tz_os_counts_subset.values/tz_os_counts_subset.sum(axis=1).values.reshape(10,1) # compute shares via NumPy arrays
tz_os_counts_subset_norm=pd.DataFrame(tz_os_counts_subset_norm,
                                      index=tz_os_counts_subset.index,
                                      columns=tz_os_counts_subset.columns)

 

plt.clf()
tz_os_counts_subset_norm.plot.barh()
plt.show()

2. MovieLens

rating_col=['UserID','MovieID','Rating','Timestamp']
user_col=['UserID','Gender','Age','Occupation','Zip-code']
movie_col=['MovieID','Title','Genres']
ratings=pd.read_table('datasets/movielens/ratings.dat',header=None,sep='::',names=rating_col,engine='python')
users=pd.read_table('datasets/movielens/users.dat',header=None,sep='::',names=user_col,engine='python')
movies=pd.read_table('datasets/movielens/movies.dat',header=None,sep='::',names=movie_col,engine='python')

ratings.sample(3)

users.sample(3)

 

movies.sample(3)

 

data=pd.merge(pd.merge(ratings,users),movies)
data.sample(3)

data.info()

Suppose we want each movie's average rating broken down by gender; a pivot table gives the result directly:

mean_ratings=data.pivot_table('Rating',index='Title',columns='Gender',aggfunc='mean')
mean_ratings[:5]

Some of the movies are obscure, so let's check how many ratings each movie received:

by_title=data.groupby('Title').size()
by_title.describe()

We use the median rating count as a rough dividing line and keep the movies above it; the code takes 250 ratings as the cutoff:

mask=by_title>=250 # note that by_title is a Series
active_titles=by_title.index[mask]
mean_ratings=mean_ratings.loc[active_titles,:]

 

 

ÏÂÃæÁгöÅ®ÐÔ¹ÛÖÚ×îϲ°®µÄµçÓ°£º

top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)[:10]
top_female_ratings

Next, let's look at how male and female ratings differ for each film:

mean_ratings.loc[:,'diff']=mean_ratings.loc[:,'F']-mean_ratings.loc[:,'M']
sorted_by_diff=mean_ratings.sort_values(by='diff',ascending=False)
sorted_by_diff[:10]

sorted_by_diff[-10:]

 

Next we look for the films with the most divided ratings; the larger the spread (standard deviation) of the ratings, the greater the disagreement:

rating_std=data.pivot_table('Rating',index='Title',aggfunc='std').loc[active_titles,:]
rating_std.sort_values(by='Rating',ascending=False)[:10]

3. US Baby Names

years=range(1880,2017)
subsets=[]
column=['name','gender','number']
for year in years:
    path='datasets/babynames/yob{}.txt'.format(year)
    df=pd.read_csv(path,header=None,names=column)
    df.loc[:,'year']=year # note that the year column holds integers
    subsets.append(df)
names=pd.concat(subsets,ignore_index=True) # concatenate the frames and renumber the rows

names.info()

 

names.sample(5)

ÎÒÃÇÏȸù¾Ý´ËÊý¾ÝÀ´´óÖ¹۲ìÒ»ÏÂÿÄêµÄÄÐÅ®³öÉúÇé¿ö£º

birth_by_gender=pd.pivot_table(names,values='number',index='year',columns='gender',aggfunc='sum')
plt.clf()
birth_by_gender.plot(title='Total births by sex and year')
plt.show()

We add a proportion column to the data, showing each name's share of births for its sex in that year:

def add_prop(group):
    group.loc[:,'prop']=group.loc[:,'number']/group.loc[:,'number'].sum()
    return group

names_with_prop=names.groupby(['year','gender']).apply(add_prop) # unlike pivot_table, groupby keeps the row-level data
names_with_prop.groupby(['year','gender'])['prop'].sum()[:6] # sanity check: props within each group should sum to 1

Next, extract the 100 most popular names within each year and gender group:

def get_top(group,n=100):
    return group.sort_values(by='number',ascending=False)[:n]

groupby_obj=names_with_prop.groupby(['year','gender'])
top100=groupby_obj.apply(get_top)
top100.reset_index(drop=True,inplace=True) # drop the row index produced by grouping
top100[:5]

Next we use these most common names for some deeper analysis:

total_birth=pd.pivot_table(top100,values='number',index='year',columns='name')
total_birth.fillna(0,inplace=True)

 

We pick a few very representative names and observe how they trend across the years:

subset=total_birth.loc[:,['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,title='Number of births per year')
plt.show()

These names each boomed in a particular era, but the closer we get to the present, the lower their frequency, which may mean parents no longer follow the crowd when naming babies. Let's test that idea:

The basic approach uses quantiles of the name frequencies. Quantiles roughly reflect how the data is distributed: if the data is packed into a narrow range, certain quantiles will sit very close together, or a quantile's position will lie far from where it would be under an even distribution.
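To make the idea concrete, here is a minimal sketch on synthetic proportions (the numbers are invented purely for illustration): when the shares are concentrated in a few names, far fewer entries are needed before the cumulative sum reaches 0.25.

concentrated=np.array([0.05,0.04,0.04,0.03,0.02]+[0.01]*82) # a few names dominate; sums to 1
spread=np.array([0.01]*100) # shares spread evenly; sums to 1
print(concentrated.cumsum().searchsorted(0.25)+1) # small count: naming is concentrated
print(spread.cumsum().searchsorted(0.25)+1) # larger count: naming is diverse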

Taking boys as an example first, pick two years for a quick check of this conjecture:

boys=top100[top100.loc[:,'gender']=='M']
boys[boys.loc[:,'year']==1940].sort_values(by='prop').loc[:,'prop'].describe()

From this output, the maximum prop is 0.05, meaning the single most common name accounts for 5% of births, and the mean of prop falls between the 75th percentile and the maximum, which says that the vast majority of newborns share a rather small pool of names.

boys[boys.loc[:,'year']==2016].sort_values(by='prop').loc[:,'prop'].describe()

In 2016 the maximum prop dropped to 0.01 and the mean sits between the 50th and 75th percentiles, showing that baby naming has become more diverse.
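As a quick cross-check (a sketch reusing the boys frame built above), compare the share of births covered by the ten most common names in each of the two years; the 1940 share should come out much larger:

for yr in (1940,2016):
    top10=boys[boys.loc[:,'year']==yr].sort_values(by='prop',ascending=False)
    print(yr,top10.loc[:,'prop'][:10].sum()) # share of that year's births taken by the top ten names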

Next we compute how many names it takes to account for the top 25% of newborns:

def get_quantile_index(group,q=0.25):
    group=group.sort_values(by='prop',ascending=False)
    sorted_arr=group.loc[:,'prop'].cumsum().values
    index=sorted_arr.searchsorted(q)+1 # +1 because indexing starts at 0
    return index

diversity=top100.groupby(['year','gender']).apply(get_quantile_index)
diversity=diversity.unstack()

plt.clf()
diversity.plot(title='number of popular names in top 25%')
plt.show()

Clearly, the closer the timeline gets to the present, the more names fall in the top 25%, which indeed shows that parents name their babies more diversely. Note also that the count for girls' names is always higher than for boys'.

Next, analyze the last letter of each name:

get_last_letter=lambda x:x[-1]
last_letters=names.loc[:,'name'].map(get_last_letter) # returns a Series
last_letters.name='last_letter'
letter_table=pd.pivot_table(names,values='number',index=last_letters,columns=['gender','year'],aggfunc='sum')
letter_table.fillna(0,inplace=True)

 

Take three years for a rough analysis:

subset=letter_table.reindex(columns=[1910,1960,2010],level='year') # reindex on the year level of the columns
subset.fillna(0,inplace=True)
letter_prop_subset=subset/subset.sum(axis=0)

 

plt.clf()
fig,axes=plt.subplots(2,1,figsize=(10,8))
letter_prop_subset.loc[:,'M'].plot(kind='bar',rot=0,ax=axes[0],title='Boy')
letter_prop_subset.loc[:,'F'].plot(kind='bar',rot=0,ax=axes[1],title='Girl')
plt.show()

This rough analysis reveals a few clear patterns:

- In the boys' data, names ending in the letter n grew explosively after 1960

- For girls, names ending in a are common, while names ending in e have become ever rarer

Next, for boys and girls respectively, pick the most common final letters and plot how these letters change over time:

letter_prop=letter_table/letter_table.sum(axis=0)
boy_letter=letter_prop.loc[['d','n','y'],'M']
boy_letter_ts=boy_letter.T
girl_letter=letter_prop.loc[['a','e','y'],'F']
girl_letter_ts=girl_letter.T

plt.clf()
fig,axes=plt.subplots(2,1,figsize=(10,8))
boy_letter_ts.plot(ax=axes[0],title='Boy')
girl_letter_ts.plot(ax=axes[1],title='Girl')
plt.show()

An interesting observation is that some boys' names have gradually shifted toward being used more by girls, for example Lesley and Leslie. Below we filter out the names containing 'lesl' to check this claim:

uni_names=names.loc[:,'name'].unique() # returns a NumPy array
uni_names=pd.Series(uni_names)
mask=uni_names.str.lower().str.contains('lesl') # lowercase, then build a boolean mask
lesl=uni_names[mask]

mask=names.loc[:,'name'].isin(lesl)
lesl_subset=names[mask]

lesl_table=pd.pivot_table(lesl_subset,values='number',index='year',columns='gender',aggfunc='sum')
lesl_table.fillna(0,inplace=True)
total=lesl_table.sum(axis=1) # compute the total before adding the prop columns
lesl_table.loc[:,'M_prop']=lesl_table.loc[:,'M']/total
lesl_table.loc[:,'F_prop']=lesl_table.loc[:,'F']/total

plt.clf()
lesl_table.loc[:,['M_prop','F_prop']].plot(style={'M_prop':'k-','F_prop':'k--'})
plt.show()

 

4. USDA Food Database

db=json.load(open('datasets/usda_food/database.json'))
len(db)

6636

db[0]

 

Each entry contains too much information to reproduce in a screenshot here.

As can be seen, every entry in the data contains the following fields:

- description

- group

- id

- manufacturer

- nutrients: the nutritional components, stored as a list of dicts (see the peek after this list)

- portions

- tags
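To see why nutrients needs special handling, it helps to peek at one element of that list before flattening it (a sketch; the exact keys are an assumption based on common descriptions of this dataset):

db[0]['nutrients'][0] # one dict from the list, with keys such as 'description', 'group', 'units', 'value'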

Because the nutrients entry is a list of dicts, converting db straight into a DataFrame would cram it all into a single column, which is very unwieldy. For clarity, create two DataFrames: one holding the food information except nutrients, the other holding id plus the nutrients information, then merge the two on id.

keys=['description','group','id']
food_df=pd.DataFrame(db,columns=keys)

food_df.info()

food_df.sample(5)

 

subsets=[]
for item in db:
    food_id=item['id'] # avoid shadowing the built-in id
    df=pd.DataFrame(item['nutrients'])
    df.loc[:,'id']=food_id
    subsets.append(df)
nutrients_df=pd.concat(subsets,ignore_index=True)
nutrients_df.drop_duplicates(inplace=True)

nutrients_df.info()

 

nutrients_df.head()

The two tables share some column names; to avoid conflicts when merging, rename the columns:

fd_col_map={
    'description':'food',
    'group':'fd_cat'
}
food_df=food_df.rename(columns=fd_col_map)
nt_col_map={
    'description':'nutrient',
    'group':'nt_cat'
}
nutrients_df=nutrients_df.rename(columns=nt_col_map)

print('{}\n{}'.format(food_df.columns,nutrients_df.columns))

 

data=pd.merge(food_df,nutrients_df,on='id',how='outer')

data.head()

Note that in this table the only column with statistical meaning is value; everything else is descriptive information.

Suppose we now want the mean nutrient amount in each food category: group the table by nutrient and fd_cat, then sort and plot the result:

nt_result=data.loc[:,'value'].groupby([data.loc[:,'nutrient'],data.loc[:,'fd_cat']]).mean()

plt.clf()
nt_result.loc['Protein'].sort_values().plot(kind='barh') # plot mean protein content by food category
plt.show()

5. 2012 Federal Election Commission Database

fec=pd.read_csv('datasets/fec/P00000001-ALL.csv',low_memory=False) # low_memory=False suppresses a mixed-dtype warning

fec.info()

Note that the data lacks each candidate's party affiliation, so we add that information by hand. First, list the candidates that appear in the data:

fec.loc[:,'cand_nm'].unique()

 

nm2pt={
'Bachmann, Michelle': 'Republican',
'Romney, Mitt': 'Republican',
'Obama, Barack': 'Democrat',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Pawlenty, Timothy': 'Republican',
'Johnson, Gary Earl': 'Republican',
'Paul, Ron': 'Republican',
'Santorum, Rick': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Huntsman, Jon': 'Republican',
'Perry, Rick': 'Republican',
}
fec.loc[:,'cand_pt']=fec.loc[:,'cand_nm'].map(nm2pt)

 

fec.loc[:,'cand_pt'].value_counts()

There is a saying that lawyers tend to donate to Democrats while business people tend to donate to Republicans; let's test this claim:

fec.loc[:,'contbr_occupation'].value_counts()[:10]

occ_map={
    'INFORMATION REQUESTED PER BEST EFFORTS':'UNKNOWN',
    'INFORMATION REQUESTED':'UNKNOWN',
    'C.E.O.':'CEO' # this entry was discovered during the later analysis
}
f=lambda x:occ_map.get(x,x) # return the mapped value, or x itself if there is no mapping

fec.loc[:,'contbr_occupation']=fec.loc[:,'contbr_occupation'].map(f)
by_occupation=pd.pivot_table(fec,values='contb_receipt_amt',index='contbr_occupation',columns='cand_pt',aggfunc='sum')
by_occupation.fillna(0,inplace=True)
by_occupation.sample(5)

by_occupation.describe()

The donation totals are distributed extremely unevenly, so we select only the occupations whose total exceeds 5e6:

mask=by_occupation.sum(axis=1)>5e6
over5mm=by_occupation[mask]
over5mm

plt.clf()
over5mm.plot(kind='barh')
plt.show()

Next we analyze the data for Obama, Barack and Romney, Mitt:

mask=fec.loc[:,'cand_nm'].isin(['Obama, Barack','Romney, Mitt'])
fec_subset=fec[mask]

 

Suppose we want, for each of the two candidates, the occupations that contributed the most; this can be done as follows:

def get_top(group,key,n=5):
    totals=group.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)

grouped=fec_subset.groupby('cand_nm')
grouped.apply(get_top,'contbr_occupation',5)

Finally, look at how each state supported the two candidates:

by_stat=fec_subset.groupby(['cand_nm','contbr_st'])['contb_receipt_amt'].sum()
mask=by_stat>5e6
by_stat=by_stat[mask]

by_stat
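A natural way to extend this (a sketch, not part of the original analysis) is to normalize within each candidate, turning the dollar totals into each state's share of that candidate's filtered total:

totals_by_cand=by_stat.groupby(level='cand_nm').sum() # total per candidate over the filtered states
percent=by_stat.div(totals_by_cand,level='cand_nm') # each state's share of that total
percent[:10]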

 

 

   