编辑推荐: |
来源csdn
,文章通过美国官方网站的几个案例详细讲解了Python数据分析,介绍较为详细,更多内容请参阅下文。 |
|
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt |
1.USA.gov Data from Bitly
此数据是美国官方网站从用户那搜集到的匿名数据。
path = 'datasets/bitly_usagov/example.txt'
# Each line of the file is parsed as one JSON record. The context manager
# closes the file handle as soon as every line has been read.
with open(path) as f:
    data = [json.loads(line) for line in f]
df=pd.DataFrame(data) |

tz字段包含的是时区信息。
# Show the ten most frequent values of the 'tz' column.
df.loc[:,'tz'].value_counts()[:10] |

根据info()与value_counts()的返回结果来看,tz列存在缺失值与空值,首先填充缺失值,然后处理空值:
# Replace missing 'tz' entries with a placeholder first, then give empty
# strings their own label.
clean_tz = df['tz'].fillna('missing')
is_empty = clean_tz == ''
clean_tz[is_empty] = 'unkonwn'
clean_tz.value_counts()[:5] |

# Horizontal bar chart of the ten most frequent values in clean_tz.
plt.clf()
subset=clean_tz.value_counts()[:10]
subset.plot.barh()
plt.show() |

a字段包含的是浏览器、设备与应用等信息。

假设我们需要统计windows与非windows的相关量,我们要抓取a字段中的'Windows'字符串。因为a字段同样存在缺失值,这里我们选择丢弃缺失值:
# Keep only the rows that have a value in column 'a'. The explicit copy makes
# clean_df an independent frame, so the assignments below neither write
# through to df nor trigger chained-assignment warnings.
clean_df = df[df['a'].notnull()].copy()
# Label empty 'tz' strings with a single .loc call; a chained
# .loc[...].loc[...] assignment may write to a temporary and be lost.
clean_df.loc[clean_df['tz'] == '', 'tz'] = 'unkonwn'
# Tag each row by whether column 'a' contains the substring 'Windows'.
mask = clean_df['a'].str.contains('Windows')
clean_df['os'] = np.where(mask, 'Windows', 'not Windows')
clean_df = clean_df.drop('a', axis=1)
|
# Count rows per (tz, os) pair and spread 'os' into columns; pairs that never
# occur become 0.
by_tz_os = clean_df.groupby(['tz', 'os'])
group_sizes = by_tz_os.size()
tz_os_counts = group_sizes.unstack().fillna(0)
# argsort gives the positions that would sort the per-tz totals ascending,
# so the last ten positions are the ten largest totals.
row_totals = tz_os_counts.sum(axis=1)
indexer = row_totals.argsort()
tz_os_counts_subset = tz_os_counts.take(indexer[-10:])
tz_os_counts_subset
|

# Horizontal bar chart of the per-os counts for the selected 'tz' rows.
plt.clf()
tz_os_counts_subset.plot.barh()
plt.show() |

因为不同地区的数量差异悬殊,如果我们要更清楚地查看系统差异,还需要将数据进行归一化:
# Turn each row into fractions of that row's total. DataFrame.div with axis=0
# aligns the totals on the row index and keeps index and columns, so this
# works for any number of rows (the earlier reshape(10, 1) needed exactly ten).
row_totals = tz_os_counts_subset.sum(axis=1)
tz_os_counts_subset_norm = tz_os_counts_subset.div(row_totals, axis=0)
|
# Horizontal bar chart of the row-normalised counts.
plt.clf()
tz_os_counts_subset_norm.plot.barh()
plt.show() |

# MovieLens
# Column names for the three MovieLens tables.
rating_col = ['UserID', 'MovieID', 'Rating', 'Timestamp']
user_col = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
movie_col = ['MovieID', 'Title', 'Genres']

# All three files are read the same way: no header row, '::' between fields,
# parsed with the python engine.
_dat_options = dict(header=None, sep='::', engine='python')
ratings = pd.read_table('datasets/movielens/ratings.dat', names=rating_col, **_dat_options)
users = pd.read_table('datasets/movielens/users.dat', names=user_col, **_dat_options)
movies = pd.read_table('datasets/movielens/movies.dat', names=movie_col, **_dat_options)
|



# Merge ratings with users, then merge that result with movies; no join keys
# are given, so pd.merge uses its default key selection.
data=pd.merge(pd.merge(ratings,users),movies)
data.sample(3) |


假如需要获得不同性别对于各电影的平均打分,使用透视表就可以直接得到结果:
# Mean 'Rating' per 'Title' (rows) and 'Gender' (columns).
mean_ratings= data.pivot_table ('Rating',index='Title', columns='Gender', aggfunc='mean')
mean_ratings[:5] |

电影中会存在冷门作品,我们看一下评分数据中各电影被评价的次数都有多少:
# Number of rows in data for each 'Title', followed by summary statistics of those counts.
by_title=data.groupby('Title').size()
by_title.describe() |

我们以二分位点为分割线,取出评分数量在二分位点之上的电影:
# True for titles that have at least 250 rows in data.
mask=by_title>=250
# note: by_title is a Series
active_titles=by_title.index[mask]
mean_ratings=mean_ratings.loc[active_titles,:] |
下面列出女性观众最喜爱的电影:
# Ten rows of mean_ratings with the highest value in column 'F'.
top_female_tarings= mean_ratings.sort_values (by='F',ascending=False)[:10]
top_female_tarings |

下面来看一下男女对于各影片的评分差异:
# 'diff' is column 'F' minus column 'M'; sorting descending puts the titles
# with the largest F-over-M difference first.
mean_ratings.loc[:,'diff'] =mean_ratings.loc[:,'F']-mean_ratings.loc[:,'M']
sorted_by_diff=mean_ratings.sort_values(by='diff',ascending=False)
sorted_by_diff[:10]
|


接下来我们统计那些评分争议较大的影片,rating的方差越大说明争议越大:
# Standard deviation of 'Rating' per 'Title', restricted to active_titles;
# the ten largest values are shown.
rating_std=data.pivot_table ('Rating',index='Title',aggfunc='std' ).loc[ active_titles,:]
rating_std.sort_values(by= 'Rating',ascending=False)[:10]
|

# US Baby Names
years = range(1880, 2017)
column = ['name', 'gender', 'number']
subsets = []
for year in years:
    path = 'datasets/babynames/yob{}.txt'.format(year)
    # The yearly files have no header row. The added 'year' column holds the
    # integer year.
    df = pd.read_csv(path, header=None, names=column).assign(year=year)
    subsets.append(df)
# Stack the yearly frames and renumber the rows from 0.
names = pd.concat(subsets, ignore_index=True)
|


我们先根据此数据来大致观察一下每年的男女出生情况:
# Sum of 'number' per year (rows) and gender (columns), plotted as one line
# per gender.
birth_by_gender = pd.pivot_table(names, values='number', index='year',
                                 columns='gender', aggfunc='sum')
plt.clf()
# The title is one string literal; it had been split across two lines.
birth_by_gender.plot(title='Total births by sex and year')
plt.show()
|

我们在数据中增加一个比例系数,这个比例能显示某个名字在这一年内占某个性别的比例:
def add_prop(group):
    """Return *group* with a 'prop' column added.

    'prop' is each row's 'number' divided by the sum of 'number' over the
    whole group. The result is a new frame; the frame passed in (for example
    by groupby.apply) is left unmodified.
    """
    total = group['number'].sum()
    return group.assign(prop=group['number'] / total)
|
# Each row's share of its (year, gender) total. transform('sum') returns the
# group total aligned to names' own index, so names_with_prop keeps the
# original index and columns and the groupby below is unambiguous however
# groupby.apply would have labelled its result.
group_totals = names.groupby(['year', 'gender'])['number'].transform('sum')
names_with_prop = names.assign(prop=names['number'] / group_totals)
# Sanity check: 'prop' should sum to 1 within every (year, gender) group.
# Note the difference between groupby and pivot_table.
names_with_prop.groupby(['year', 'gender'])['prop'].sum()[:6]
|

下面取出按year与gender分组后的最受欢迎的前100个名字:
def get_top(group, n=100):
    """Return the first *n* rows of *group* after sorting by 'number', largest first."""
    ranked = group.sort_values(by='number', ascending=False)
    return ranked[:n]
|
groupby_obj = names_with_prop.groupby(['year', 'gender'])
# Apply get_top to every (year, gender) group, then discard the row index
# produced by the grouping.
top100 = groupby_obj.apply(get_top).reset_index(drop=True)
top100[:5]
|

接下来我们使用这些最常见的名字来做更深入的分析:
# Sum 'number' per (year, name). pivot_table aggregates with the mean by
# default, which would average rather than add the rows of a name that is
# in top100 for both genders in the same year.
total_birth = pd.pivot_table(top100, values='number', index='year',
                             columns='name', aggfunc='sum')
total_birth.fillna (0,inplace=True) |
我们选取几个非常具有代表性的名字,来观察这些名字根据年份的变化趋势:
# One subplot per selected name. The title is one string literal; it had
# been split across two lines.
subset = total_birth.loc[:, ['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, title='Number of birth per year')
plt.show() |

可以看出这几个名字在特定的时期出现了井喷现象,但越靠近现在的时间段,这些名字出现的频率越低,这可能说明家长们给宝宝起名字不再随大流。下面来验证这个想法:
基本思想是使用名字频率的分位数,数据的分位数能大致体现出数据的分布,如果数据在某一段特别密集,则某两个分位数肯定靠得特别近,或者分位数的序号会偏离标准值非常远。
先以男孩为例,取两个年份来简单验证下以上猜想:
# Rows of top100 whose 'gender' is 'M', then summary statistics of 'prop' for year 1940.
boys=top100[top100.loc[:,'gender']=='M']
boys[boys.loc[:,'year']==1940].sort_values (by='prop').loc[:,'prop'].describe() |

由上述数据可以看到,prop的最大值为0.05,说明最常见的名字的可观测率为5%,而且prop的均值处于[75%,max]区间内,说明绝大多数的新生儿共享一个很小的名字池。
# Same summary of 'prop' for year 2016.
boys[boys.loc[:,'year'] ==2016].sort_values(by='prop' ).loc[:,'prop'].describe() |

在2016年,prop的最大值降到了0.01,均值处于[50%,75%]区间内,这说明新生儿的取名更多样化了。
下面我们来计算占据新生儿前25%的名字数量:
def get_quantile_index(group, q=0.25):
    """Return how many of the largest-'prop' rows are needed to reach a cumulative 'prop' of *q*.

    Rows are ordered by 'prop' descending and accumulated; the result is the
    1-based position at which the running total first reaches *q*.
    """
    ordered = group.sort_values(by='prop', ascending=False)
    running_total = ordered.loc[:, 'prop'].cumsum().values
    # searchsorted gives a 0-based position, hence the +1. Use the q
    # argument rather than a fixed 0.25 so callers can pick the threshold.
    return running_total.searchsorted(q) + 1
|
# One get_quantile_index result per (year, gender), with gender moved into columns.
per_group = top100.groupby(['year', 'gender']).apply(get_quantile_index)
diversity = per_group.unstack()
|
plt.clf()
# The title is one string literal; it had been split across two lines.
diversity.plot(title='number of popular names in top 25%')
plt.show() |

可以明显看出时间线越靠近现在,前25%的新生儿名字数量也越多,这确实说明家长们给宝宝起名字更多样化了。并且还注意到女孩名字的数量总是多于男孩。
下面分析名字的最后一个字母:
def get_last_letter(x):
    """Return the last character of *x*."""
    return x[-1]
# Series holding the last letter of every name, named so the pivot index is labelled.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'
# Sum of 'number' per last letter (rows) and (gender, year) (columns);
# combinations that never occur become 0.
letter_table = pd.pivot_table(
    names,
    values='number',
    index=last_letters,
    columns=['gender', 'year'],
    aggfunc='sum',
).fillna(0)
|
取出三个年份来进行粗略分析:
# Reindex the 'year' level of the columns to three years, fill gaps with 0,
# then divide every column by its own total.
subset = letter_table.reindex(columns=[1910, 1960, 2010], level='year').fillna(0)
column_totals = subset.sum(axis=0)
letter_prop_subset = subset / column_totals
|
# Two stacked bar charts: the 'M' columns on the upper axes, the 'F' columns on the lower.
plt.clf()
fig,axes=plt.subplots(2,1,figsize=(10,8))
letter_prop_subset.loc [:,'M'].plot (kind='bar',rot=0,ax=axes[0],title='Boy')
letter_prop_subset.loc [:,'F'].plot (kind='bar',rot=0,ax=axes[1],title='Girl')
plt.show()
|

从上面的粗略分析可以看到几个明显的情况:
- 在boy的数据里,以字母n为结尾的名字在1960年后出现了爆炸式增长
- 对girl而言,字母a结尾的名字较常见,而字母e结尾的名字则越来越少
下面分别针对boy与girl挑选出最常见的名字尾字母,绘制出这些字母随时间的变化曲线:
# Divide every (gender, year) column by its total, pick three letters per
# gender, and transpose so the years run along the rows.
column_totals = letter_table.sum(axis=0)
letter_prop = letter_table / column_totals
boy_letter = letter_prop.loc[['d', 'n', 'y'], 'M']
girl_letter = letter_prop.loc[['a', 'e', 'y'], 'F']
boy_letter_ts = boy_letter.transpose()
girl_letter_ts = girl_letter.transpose()
|
# Line plots of the selected letters: boy_letter_ts on the upper axes, girl_letter_ts on the lower.
plt.clf()
fig,axes=plt.subplots(2,1,figsize=(10,8))
boy_letter_ts.plot(ax=axes[0],title='Boy')
girl_letter_ts.plot(ax=axes[1],title='Girl')
plt.show()
|

根据一个有趣的发现,表明有些男孩的名字正逐渐转向被更多的女孩使用,比如说Lesley和Leslie,下面就筛选出包含lesl的名字来验证这个说法:
# unique() returns an array, so wrap it in a Series to get the .str accessor.
uni_names = pd.Series(names['name'].unique())
# Lower-case each name, then test for the substring 'lesl' -> boolean Series.
lowered = uni_names.str.lower()
mask = lowered.str.contains('lesl')
lesl = uni_names[mask]
|
# Keep the rows of names whose 'name' is one of the values in lesl.
mask=names.loc[:,'name'].isin(lesl)
lesl_subset=names[mask] |
# Sum of 'number' per year (rows) and gender (columns).
lesl_table = pd.pivot_table(lesl_subset, values='number', index='year',
                            columns='gender', aggfunc='sum')
lesl_table.fillna(0, inplace=True)
# Take the yearly total once, before any proportion column is added.
# Summing the whole row again after 'M_prop' exists would include that
# column and skew 'F_prop'.
yearly_total = lesl_table.loc[:, 'M'] + lesl_table.loc[:, 'F']
lesl_table.loc[:, 'M_prop'] = lesl_table.loc[:, 'M'] / yearly_total
lesl_table.loc[:, 'F_prop'] = lesl_table.loc[:, 'F'] / yearly_total
|
# Plot both proportion columns: solid black for 'M_prop', dashed black for 'F_prop'.
plt.clf()
lesl_table.loc[:, ['M_prop','F_prop']].plot( style={'M_prop':'k-','F_prop':'k--'})
plt.show()
|

USDA Food Database
# Load the whole JSON document; the context manager closes the file handle
# once parsing is done.
with open('datasets/usda_food/database.json') as f:
    db = json.load(f)
len(db) |
6636
这里每个条目包含的信息太多,不给出截图了。
可以看到数据中每个条目包含以下信息:
- description
- group
- id
- manufacturer
- nutrients:营养成分,字典的列表
- portions
- tags
因为nutrients项是一个字典的列表,如果将db直接转化为dataframe的话这一项就会被归到一个列中,非常拥挤。为了便于理解,创建两个df,一个包含除了nutrients之外的食物信息,而另一个包含id与nutrients信息,然后再将两者根据id合并。
# Build a frame from db that keeps only the three listed columns.
keys=['description','group','id']
food_df=pd.DataFrame(db,columns=keys) |


# Build one frame per db item from its 'nutrients' list, tag every row with
# the item's id, then stack the frames and drop duplicate rows.
subsets = []
for item in db:
    food_id = item['id']  # named food_id so the builtin id() is not shadowed
    df = pd.DataFrame(item['nutrients'])
    df.loc[:, 'id'] = food_id
    subsets.append(df)
nutrients_df = pd.concat(subsets, ignore_index=True)
nutrients_df.drop_duplicates(inplace=True)
|


观察到两个表中出现了同样的列索引,为了合并表时不出现矛盾,更改列索引名称:
# Rename 'description' and 'group' in each frame to frame-specific names.
fd_col_map = dict(description='food', group='fd_cat')
nt_col_map = dict(description='nutrient', group='nt_cat')
food_df = food_df.rename(columns=fd_col_map)
nutrients_df = nutrients_df.rename(columns=nt_col_map)
|
# Print the column labels of both frames, one frame per line.
print(food_df.columns, nutrients_df.columns, sep='\n')
|

# Outer join on 'id': rows from either frame are kept even without a match in the other.
data=pd.merge(food_df,nutrients_df,on='id',how='outer')
|

注意这个表中,唯一具有统计意义的值是value列,其余都是描述性信息。
假设现在需要统计哪种食物类别拥有的营养量均值,可以先将表对nutrient与fd_cat进行分组,再进行排序输出:
# Mean of 'value' for every (nutrient, fd_cat) pair.
nt_result=data.loc [:,'value'].groupby ([data.loc[:,'nutrient'], data.loc[:,'fd_cat']]).mean() |
plt.clf()
nt_result.loc ['Protein'].sort_values().plot(kind='barh')
# plot the mean values of the 'Protein' entry, sorted ascending
plt.show() |

2012 Federal Election Commission Database
fec=pd.read_csv('datasets/fec/P00000001-ALL.csv',low_memory=False)
# low_memory=False is passed to avoid a warning |

注意到数据中没有候选人所属的党派这一信息,所以可以考虑人为加上这一信息。首先统计出数据中有多少位候选人:
# Distinct values of the 'cand_nm' column.
fec.loc[:,'cand_nm'].unique() |

# Party label for each candidate name, keyed by the name string used for
# lookups in the 'cand_nm' column. Every key is a single string literal;
# the Roemer key had been split across two lines.
nm2pt = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# Look up each 'cand_nm' value in nm2pt and store the result in a new 'cand_pt' column.
fec.loc[:,'cand_pt']=fec.loc[:,'cand_nm'].map(nm2pt)
|
# Number of rows per 'cand_pt' value.
fec.loc[:,'cand_pt'].value_counts()
|

据说有一个现象,律师会倾向于捐给民主党,而经济人士会倾向于捐给共和党,下面就来验证这一说法:
# Ten most frequent values of 'contbr_occupation'.
fec.loc[:,'contbr_occupation'].value_counts()[:10]
|

# Spelling variants of an occupation mapped to one canonical label.
occ_map = {
    'INFORMATION REQUESTED PER BEST EFFORTS': 'UNKNOW',
    'INFORMATION REQUESTED': 'UNKNOW',
    'C.E.O.': 'CEO',  # this entry was found later in the analysis
}


def f(x):
    """Return the canonical label for *x* from occ_map, or *x* itself when it has no entry."""
    return occ_map.get(x, x)
|
# Normalise the occupation labels, then total 'contb_receipt_amt' per
# occupation (rows) and 'cand_pt' (columns); missing combinations become 0.
fec.loc[:, 'contbr_occupation'] = fec.loc[:, 'contbr_occupation'].map(f)
by_occupation = fec.pivot_table(
    values='contb_receipt_amt',
    index='contbr_occupation',
    columns='cand_pt',
    aggfunc='sum',
).fillna(0)
by_occupation.sample(5)
|


看出捐献金额分布的极度不平衡,我们只选出总数大于5e6的条目:
# Keep the rows whose total across all columns exceeds 5e6.
mask=by_occupation.sum(axis=1)>5e6
over5mm=by_occupation[mask]
over5mm |

# Horizontal bar chart of over5mm.
plt.clf()
over5mm.plot(kind='barh')
plt.show() |

ÏÂÃæÎÒÃǶÔObama BarackÓëRomney MittµÄÊý¾Ý½øÐзÖÎö£º
# True for rows whose 'cand_nm' is one of the two candidates. Each name is a
# single string literal; 'Obama, Barack' had been split across two lines.
mask = fec.loc[:, 'cand_nm'].isin(['Obama, Barack', 'Romney, Mitt'])
fec_subset=fec[mask] |
假设需要分别统计出对这两个人支持最大的各职业,可以这样做:
def get_top(group, key, n=5):
    """Sum 'contb_receipt_amt' in *group* per value of *key* and return the *n* largest sums.

    The result is a Series indexed by the *key* values, largest sum first.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.nlargest(n)
# Run get_top once per 'cand_nm' group, keyed on 'contbr_occupation' with n=5.
grouped=fec_subset.groupby('cand_nm')
grouped.apply(get_top,'contbr_occupation',5) |

下面看各州对两人的支持情况:
# Total 'contb_receipt_amt' per (cand_nm, contbr_st) pair. The groupby sum
# takes no 'axes' keyword, so it is called without it.
by_stat = fec_subset.groupby(['cand_nm', 'contbr_st'])['contb_receipt_amt'].sum()
# Keep only the entries of by_stat that exceed 5e6.
mask=by_stat>5e6
by_stat=by_stat[mask] |

|