Editor's recommendation: This article introduces the machine learning development workflow, what feature engineering is, and its main parts: feature extraction, feature preprocessing, and feature dimensionality reduction.
»úÆ÷ѧϰ¿ª·¢Á÷³Ì¸ÅÀ¨

Data preprocessing: in Python this is done with the pandas library, e.g., data cleaning and handling of missing values, outliers, and the like (pandas data handling; a minimal sketch follows).
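The original gives no preprocessing code; as a minimal sketch (the column names, fill strategy, and capping rule are illustrative assumptions, not from the article), missing values and outliers might be cleaned like this:

import pandas as pd
import numpy as np
# Hypothetical data with one missing value and one extreme outlier
df = pd.DataFrame({"age": [23, 25, np.nan, 24],
                   "income": [3000, 3200, 3100, 9000000]})
df["age"] = df["age"].fillna(df["age"].median())  # fill the missing age with the column median
cap = df["income"].quantile(0.95)                 # cap extreme incomes at the 95th percentile
df["income"] = df["income"].clip(upper=cap)
print(df)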
Overview of feature engineering
Feature engineering: the process of applying domain knowledge and technique to the data so that the features (the independent variables) work better with machine learning algorithms.
Python's sklearn library provides a powerful set of interfaces for feature engineering.
Feature engineering mainly covers:
Feature extraction: converting arbitrary data (text, images, etc.) into numeric features usable by machine learning
    text ---> numeric; categorical data: string ---> numeric
Feature preprocessing
Feature dimensionality reduction
1. Feature extraction
1.1 Dictionary feature extraction: category -> one-hot encoding
sklearn.feature_extraction.DictVectorizer(sparse=True/False, ...)
- DictVectorizer.fit_transform(X)  X: a dict or an iterator of dicts; returns a sparse matrix
- DictVectorizer.inverse_transform(X)  X: an array or a sparse matrix; returns the data in its pre-transform format
- DictVectorizer.get_feature_names()  returns the category names
'''Dictionary feature extraction'''
from sklearn.feature_extraction import DictVectorizer
# 1. Data: a dict or an iterator of dicts (a list of dicts works)
data = [{"city": "北京", "housing_price": 250},
        {"city": "上海", "housing_price": 260},
        {"city": "广州", "housing_price": 200}]
# 2. Instantiate a transformer class
transfer = DictVectorizer(sparse=True)
# 3. Call fit_transform()
data_new = transfer.fit_transform(data)
print(data_new)  # coordinates and values of the nonzero entries
'''
(0, 1) 1.0
(0, 3) 250.0
(1, 0) 1.0
(1, 3) 260.0
(2, 2) 1.0
(2, 3) 200.0
'''
print(transfer.get_feature_names())  # returns the category names
# 2. Instantiate a transformer class again, this time with dense output
transfer = DictVectorizer(sparse=False)
# 3. Call fit_transform()
data_new = transfer.fit_transform(data)
print(data_new)  # 2-D array
'''
[[  0.   1.   0. 250.]
 [  1.   0.   0. 260.]
 [  0.   0.   1. 200.]]
'''
print(transfer.get_feature_names())  # returns the category names
# ['city=上海', 'city=北京', 'city=广州', 'housing_price']
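The API summary above also lists inverse_transform, which the example never exercises; continuing from the dense data_new above, a minimal sketch:

print(transfer.inverse_transform(data_new))  # recover dict-style records from the encoded array
# [{'city=北京': 1.0, 'housing_price': 250.0}, ...] -- note the keys are the
# one-hot feature names, not the original category strings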
1.2 Text feature extraction
Words and terms serve as the feature values.
Method 1: sklearn.feature_extraction.text.CountVectorizer(stop_words=[])
Returns the number of times each word occurs, as a term-frequency matrix; stop_words=[] is the stop-word list.
·CountVectorizer.fit_transform(X)  X: text, or an iterable of text strings; returns a sparse matrix
·CountVectorizer.inverse_transform(X)  X: an array or a sparse matrix; returns the data in its pre-transform format
·CountVectorizer.get_feature_names()  returns the word list
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
data = ["Maybe it was better to just really enjoy life. this is the life",
        "享受生活，顺其自然。这就是生活"]
transfer = CountVectorizer()             # instantiate a transformer class
data_new = transfer.fit_transform(data)  # call fit_transform()
# print(data_new)
print(transfer.get_feature_names())
print(data_new.toarray())
# Build a 2-D table:
data = pd.DataFrame(data_new.toarray(), columns=transfer.get_feature_names())
display(data)
# Note that the Chinese text is tokenized poorly. Reason: the tokenizer splits on spaces, punctuation, and other special characters.

ÖÐÎÄÎı¾µÄ·Ö´Ê
ÐèÒª½èÖújieba·Ö´Ê¿â
import pandas
as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import jieba #(jieba ·Ö´Ê£¬°²×°£º pip install jieba)
data=[u'½ñÄê¹úÇì½Ú´òËãÈ¥º£Äϵº¶È¼Ù',"ÏíÊÜÉú»î£¬Ë³Æä×ÔÈ»¡£Õâ¾ÍÊÇÉú»î!"]
#·Ö´Ê
cut_data=[]
for s in data:
cut_s=jieba.cut(s)
l_cut_s=' '.join(list(cut_s))
cut_data.append(l_cut_s)
print(cut_data)
#ͳ¼ÆÌØÕ÷´Ê³öÏÖ´ÎÊý
transfer = CountVectorizer(stop_words=["´òËã","¾ÍÊÇ"])
#ʵÀý»¯Ò»¸öת»»Æ÷Àà,
# stop_words=["´òËã","¾ÍÊÇ"],È¥³ý²»ÏëÒªµÄ´Ê
data_new = transfer.fit_transform (cut_data) #µ÷ÓÃfit_transform()
#print(data_new)
print(transfer.get_feature_names())
print(data_new.toarray())
#¹¹½¨³ÉÒ»¸ö¶þά±í£º
data=pd.DataFrame(data_new.toarray(), columns=transfer.get_feature_names())
display(data) |

Method 2: sklearn.feature_extraction.text.TfidfVectorizer(stop_words=None)
·The main idea of TF-IDF: if a word or phrase appears frequently in one article but rarely in other articles, it is considered to have good power to discriminate between categories and is suitable for classification. The higher the returned value, the more important the word is to that article.
·Purpose of TF-IDF: to evaluate how important a word is to one document within a document set or corpus.
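The article states the idea without the formula; the textbook definition, for reference, is

\mathrm{tfidf}(t,d) = \mathrm{tf}(t,d)\cdot\mathrm{idf}(t), \qquad \mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}

where tf(t,d) is how often term t occurs in document d, N is the number of documents, and df(t) is the number of documents containing t. Note that sklearn's TfidfVectorizer additionally smooths the idf and L2-normalizes each row by default.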

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba  # jieba tokenizer; install with: pip install jieba
# Data
data = ["移动共享，共享汽车，共享经济，共享单车", "财经栏目，财经政策，经济政策，共享经济"]
# Tokenize
cut_data = []
for s in data:
    cut_s = jieba.cut(s)
    l_cut_s = ' '.join(list(cut_s))
    cut_data.append(l_cut_s)
print(cut_data)
# TF-IDF
transfer = TfidfVectorizer()                 # instantiate a transformer class
data_new = transfer.fit_transform(cut_data)  # call fit_transform()
# print(data_new)
print(transfer.get_feature_names())
print(data_new.toarray())
# Build a 2-D table:
data = pd.DataFrame(data_new.toarray(), columns=transfer.get_feature_names())
display(data)

2. Feature preprocessing
Transformation functions are used to convert the feature data into feature data better suited to the algorithm/model.
This covers:
Rescaling (non-dimensionalization) of numeric data:
    Normalization
    Standardization
For normalization: if outliers distort the maximum or minimum value, the result obviously changes.
For standardization: given a reasonable amount of data, a few outliers have little effect on the mean, so the variance changes only slightly.
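For reference (the original does not write them out), the two transforms are

x' = \frac{x - \min}{\max - \min} \quad \text{(min-max normalization; MinMaxScaler then rescales to feature\_range } [a,b] \text{ via } x'' = x'(b-a)+a)

x' = \frac{x - \mu}{\sigma} \quad \text{(standardization, with column mean } \mu \text{ and standard deviation } \sigma)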
Feature preprocessing API:
sklearn.preprocessing
Why do we normalize/standardize?
·When features differ greatly in unit or scale, or one feature's variance is several orders of magnitude larger than the others', that feature tends to influence (dominate) the target result, and some algorithms become unable to learn from the other features.


2.1 Normalization
Drawback of normalization: outliers (missing values and the like) cannot be handled; when the maximum or minimum value is abnormal, the result is distorted.


# Normalization
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler  # normalization scaler
# 1. Get the data (the separator is assumed to be a tab; the original line was garbled)
df = pd.read_csv(r"E:\Normalization.txt", sep="\t", encoding="utf-8")
display(df.sample(3))
x = df.iloc[:, :3]
display(x.head(3))
# 2. Instantiate a transformer class
transfer = MinMaxScaler(feature_range=(10, 20))
# feature_range=(10, 20) sets the value range after normalization
# 3. Call fit_transform()
xi = transfer.fit_transform(x)
print(xi)
# 4. Convert to a 2-D table
data = pd.DataFrame(xi, columns=x.columns)
data["y"] = df['y']
display(data)

2.2 Standardization
Standardization effectively avoids the drawback of normalization (the influence of the maximum and minimum values).

# Standardization (transforms values to have mean near 0 and standard deviation near 1)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler  # standardization scaler
# 1. Get the data (the separator is assumed to be a tab; the original line was garbled)
df = pd.read_csv(r"E:\Normalization.txt", sep="\t", encoding="utf-8")
display(df.sample(3))
x = df.iloc[:, :3]
display(x.head(3))
# 2. Instantiate a transformer class
transfer = StandardScaler()
# 3. Call fit_transform()
xi = transfer.fit_transform(x)
print(xi)
# 4. Convert to a 2-D table
data = pd.DataFrame(xi, columns=x.columns)
data["y"] = df['y']
display(data.tail(3))

3. Feature dimensionality reduction
Dimensionality reduction is the process of reducing the number of random variables (feature variables, independent variables) under certain constraints, to obtain a set of "uncorrelated" principal variables.
Goal: the feature variables left after reduction are mutually uncorrelated; redundant feature variables are removed.
Methods: feature selection, principal component analysis
3.1 Feature selection
The data contain redundant or correlated variables (also called features, attributes, indicators, etc.); the aim is to pick out the main features from the original ones.
Methods:
Filter: mainly examines the characteristics of the features themselves, and the associations between features and between each feature and the target
    Variance selection: low-variance feature filtering
    Correlation coefficient
Remove some low-variance features (the meaning of variance was covered earlier), judging this approach by how large the variances actually are.
Highly correlated feature variables can be reduced: remove the less important one, or construct a new feature variable from the pair (e.g., a weighted sum, or PCA).
Embedded: the algorithm selects features automatically (from the association between each feature and the target)
    Decision trees: information entropy, information gain
    Regularization: L1, L2
    Deep learning: convolution, etc.
Python module for variance selection: sklearn.feature_selection
Low-variance feature filtering:
Remove features whose variance is low, setting the variance threshold according to the actual data and business needs.
·Small feature variance: the samples' values for that feature are all fairly similar
·Large feature variance: the samples' values for that feature differ widely
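The text leans on "the meaning of variance" from an earlier part of the series; for reference, the variance of a feature over n samples is

\sigma^2 = \frac{1}{n}\sum_{i=1}^{n}\left(x_i - \bar{x}\right)^2

so a feature whose values barely move contributes almost nothing to telling samples apart.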

# Construct a set of feature data
import numpy as np
import pandas as pd  # (the original imported "pandas as dp", a typo; pd is used below)
from scipy import stats
np.random.seed(10)  # seed the generator so the random draws below are reproducible; any integer works
x1 = stats.norm.rvs(loc=5, scale=0.0, size=500)
# normal distribution with mean 5 and standard deviation 0.0; draw 500 samples
x2 = stats.t.rvs(10, size=500)  # 500 random numbers from a t distribution with 10 degrees of freedom (rvs draws random variates)
x3 = stats.norm.rvs(loc=1, scale=0.5, size=500)
# normal distribution with mean 1 and standard deviation 0.5; draw 500 samples
x4 = np.random.rand(500)
x5 = 10*x4 + 5 + stats.norm.rvs(loc=0, scale=0.0, size=500)
# stats.norm.rvs(loc=0, scale=0.0, size=500) is used to construct the residuals
x6 = -5*x2 + 1 + stats.norm.rvs(loc=0, scale=0.0, size=500)
# no target value y is constructed for now
data = pd.DataFrame({"x1": x1, "x2": x2, "x3": x3, "x4": x4, "x5": x5, "x6": x6})
display(data.sample(5))

# Low-variance feature filtering:
from sklearn.feature_selection import VarianceThreshold
# 1. Get the data: the feature data `data` constructed in the previous block
# 2. Instantiate a transformer class
transfer = VarianceThreshold(threshold=0.25)
# threshold=0.25 sets the variance threshold: features with variance <= 0.25 return False here.
# Set the threshold according to actual business needs; the default is 0.0.
# 3. Call fit_transform()
xi = transfer.fit_transform(data)
print(xi, "\n", transfer.get_support())
# 4. Output the filtered feature variables
data01 = data.iloc[:, [False, True, True, False, True, True]]
display(data01.head(3))
# Although x4 is a set of random numbers, its variance is < 0.25, so it was filtered out too

Correlation
Correlation coefficient: ranges over (-1, 1)
Highly correlated feature variables can be reduced: remove the less important one, or construct a new feature variable from the pair (e.g., a weighted sum, or PCA).

pearson: product-moment correlation coefficient, reflecting the linear correlation between two variables
spearman: rank correlation coefficient (ranked data); the formula is the same as pearson's, but all variables are ranked first and the linear correlation is computed on the ranks
- - - commonly used
Kendall's tau: nonparametric rank correlation coefficient
tau = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
P: number of concordant pairs; Q: discordant pairs; T: ties in x; U: ties in y
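For comparison with the Kendall formula above, Pearson's coefficient (which the original never writes out) is

r = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2}\,\sqrt{\sum_i (y_i - \bar{y})^2}}

and Spearman's coefficient is this same formula applied to the ranks of x and y.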
Differences and choosing between them:
Spearman's coefficient: rank all variables first, then compute the linear correlation. Unlike pearson, it does not assume the variables are normally distributed.
If there is a linear pattern plus some outlying points, the spearman coefficient will be somewhat larger, because the outliers break the linear correlation but barely affect the ranking.
pearson can only handle two arrays of data; spearman can handle multiple sequences at once.
Kendall's tau-b rank correlation coefficient: an index of the correlation between categorical variables, applicable when both variables are ordinal. It performs a nonparametric correlation test on related ordinal variables; it ranges between -1 and 1, and the test is suited to square contingency tables.
The product-moment pearson coefficient can only be used with continuous variables; the Spearman rank coefficient suits ordinal variables or equal-interval data that do not satisfy the normality assumption;
the Kendall rank coefficient likewise suits ordinal variables or equal-interval data that do not satisfy the normality assumption.
When the data do not follow a bivariate normal distribution, the population distribution is unknown, or the raw data are expressed as ranks, prefer spearman or kendall correlation.
Summary: in most cases the spearman test is sufficient. (Examples of all three follow; the Kendall sketch comes after the spearman example.)
# pearson can only test the correlation of two arrays at a time
from scipy.stats import pearsonr
import pandas as pd
# 1. Data: the data01 produced by the previous block
print("x2-->x3:", pearsonr(data01['x2'], data01['x3']))
print("x2-->x5:", pearsonr(data01['x2'], data01['x5']))
print("x2-->x6:", pearsonr(data01['x2'], data01['x6']))
print("x3-->x5:", pearsonr(data01['x3'], data01['x5']))
print("x3-->x6:", pearsonr(data01['x3'], data01['x6']))
print("x5-->x6:", pearsonr(data01['x5'], data01['x6']))
# Each call returns the linear correlation and the p-value
'''
x2-->x3: (-0.011882657685256122, 0.7909719957310081)   # uncorrelated
x2-->x5: (0.04534181148083765, 0.31160663007500405)    # uncorrelated
x2-->x6: (-1.0, 0.0)  # correlated: remove the less important feature variable,
                      # or construct a new one from the pair (e.g., weighted sum, PCA)
x3-->x5: (-0.07689352056728531, 0.08586381452347397)   # uncorrelated
x3-->x6: (0.011882657685256105, 0.7909719957310081)    # uncorrelated
x5-->x6: (-0.04534181148083762, 0.31160663007500405)   # uncorrelated
'''
# spearman: rank correlation (ranked data) test
from scipy.stats import spearmanr
import pandas as pd
correlation, pvalue = spearmanr(data01)  # a 2-D table can be passed in directly
# Build 2-D tables for easier viewing
correlation = pd.DataFrame(correlation, index=data01.columns, columns=data01.columns)
pvalue = pd.DataFrame(pvalue, index=data01.columns, columns=data01.columns)
display(correlation, pvalue)
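The article describes Kendall's tau but gives no code for it; a minimal sketch with scipy (kendalltau takes exactly two 1-D arrays, so pairs are tested one at a time):

# Kendall's tau: nonparametric rank correlation of one pair of columns
from scipy.stats import kendalltau
tau, p = kendalltau(data01['x2'], data01['x6'])
print("x2-->x6:", tau, p)  # tau near -1 again flags this pair as strongly (monotonically) related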

3.2 Principal component analysis
Use PCA to reduce the dimensionality of the features.
·Definition: the process of converting high-dimensional data into low-dimensional data; along the way some of the original data may be discarded and new variables created
·Purpose: compresses the dimensionality of the data, reducing the number of dimensions (the complexity) of the original data as much as possible while losing only a small amount of information
·Applications: regression analysis and cluster analysis
API:
·sklearn.decomposition.PCA(n_components=None)
- decomposes the data into a lower-dimensional space
- n_components:
    ·decimal: the percentage of information to retain
    ·integer: the number of features to reduce to
- PCA.fit_transform(X)  X: data in numpy array format, [n_samples, n_features]
- Returns: an array with the specified number of dimensions after the transformation
# Principal component analysis, retaining n.n% of the information
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
# 1. Data: the data01 generated by the code above
display(data01.head(3))
# 2. Instantiate a transformer class
transfer = PCA(n_components=0.9)
# n_components: decimal = percentage of information to retain; integer = number of features to reduce to
# 3. Call fit_transform()
xi = transfer.fit_transform(data01)
# Check how many new variables were constructed, and each variable's explained variance ratio
print(xi.shape, transfer.explained_variance_ratio_)
# 4. Output the newly constructed principal-component variables
Fi = []
for i in range(1, xi.shape[1] + 1):
    F = "F" + str(i)
    Fi.append(F)
data02 = pd.DataFrame(xi, columns=Fi)
display(data02.head(3))

# Principal component analysis, retaining the information of n variables
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
# 1. Data: the data01 generated by the code above
display(data01.head(3))
# 2. Instantiate a transformer class
transfer = PCA(n_components=3)
# n_components: decimal = percentage of information to retain; integer = number of features to reduce to
# 3. Call fit_transform()
xi = transfer.fit_transform(data01)
print(xi.shape, transfer.explained_variance_ratio_)
# Check each new variable's explained variance ratio
# 4. Output the newly constructed principal-component variables
Fi = []
for i in range(1, xi.shape[1] + 1):
    F = "F" + str(i)
    Fi.append(F)
data02 = pd.DataFrame(xi, columns=Fi)
display(data02.head(3))

4. Feature engineering: data exploration
Test the sample distribution of each individual variable
Explore the relationships between the variables
Only once feature engineering is done can you effectively tell whether the sample is suitable for modeling (machine learning), then choose a suitable machine-learning algorithm, and keep evaluating and tuning it. A minimal exploration sketch follows.
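As a minimal sketch of both steps (continuing with data01; the choice of the D'Agostino-Pearson test is an illustrative assumption, not from the article):

# Univariate distribution check: normality test on each column
from scipy import stats
for col in data01.columns:
    stat, p = stats.normaltest(data01[col])  # D'Agostino-Pearson normality test
    print(col, "p-value:", p)                # small p (< 0.05) suggests the column is not normal
# Relationships between variables: the spearman matrix from section 3.1 works here too
display(data01.corr(method="spearman"))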