Hive
Hive is a data warehouse analysis system built on Hadoop. It provides a rich SQL-style query interface for analyzing data stored in the Hadoop distributed file system: structured data files can be mapped onto a database table, and full SQL query functionality is provided by translating SQL statements into MapReduce jobs for execution. This SQL dialect is called Hive SQL (HiveQL), and it lets users who are not familiar with MapReduce conveniently query, summarize, and analyze data with SQL. MapReduce developers can also plug in their own mappers and reducers to let Hive perform more complex data analysis.
HiveQL differs slightly from the SQL of relational databases, but it supports the vast majority of statements, such as DDL and DML, as well as common aggregate functions, joins, and conditional queries. Hive is not suitable for online (OLTP) transaction processing and does not offer real-time queries; it is best suited to batch jobs over large amounts of immutable data.
Hive's main characteristics: scalable (nodes can be added to the Hadoop cluster dynamically), extensible, fault-tolerant, and loosely coupled to its input formats.
The official Hive documentation describes the query language in great detail; see http://wiki.apache.org/hadoop/Hive/LanguageManual. Most of this article is translated from that page, with some notes added along the way about things to watch out for in practice.
1. DDL operations
Creating tables
Dropping tables
Altering table structure
Creating/dropping views
Creating databases
Show commands
1.2 Creating tables:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
  [(col_name data_type [COMMENT col_comment], ...)]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...)
    [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [ROW FORMAT row_format]
  [STORED AS file_format]
  [LOCATION hdfs_path]
CREATE TABLE creates a table with the given name. If a table with the same name already exists, an exception is thrown; the user can suppress it with the IF NOT EXISTS option.
The EXTERNAL keyword lets the user create an external table, specifying a path (LOCATION) pointing to the actual data at creation time.
LIKE lets the user copy an existing table definition without copying its data.
COMMENT adds a description to the table or to individual columns.
ROW FORMAT
  DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
    [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value,
      property_name=property_value, ...)]
When creating a table, the user can define a custom SerDe or use one of the built-in SerDes. If neither ROW FORMAT nor ROW FORMAT DELIMITED is specified, a built-in SerDe is used. The user must also specify the table's columns; along with the columns, any custom SerDe is specified as well, and Hive uses the SerDe to determine the table's actual column data.
STORED AS
  SEQUENCEFILE
  | TEXTFILE
  | RCFILE
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
If the file data is plain text, use STORED AS TEXTFILE. If the data needs to be compressed, use STORED AS SEQUENCEFILE.
1.3 Creating a simple table:
hive> CREATE TABLE pokes (foo INT, bar STRING);
1.4 Creating an external table:
CREATE EXTERNAL TABLE page_view(viewTime INT, userid BIGINT,
page_url STRING, referrer_url STRING,
ip STRING COMMENT 'IP Address of the User',
country STRING COMMENT 'country of origination')
COMMENT 'This is the staging page view table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\054'
STORED AS TEXTFILE
LOCATION '<hdfs_location>';
1.5 Creating a partitioned table
CREATE TABLE par_table(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS SEQUENCEFILE;
1.6 Creating a bucketed table
CREATE TABLE par_table(viewTime INT, userid BIGINT,
     page_url STRING, referrer_url STRING,
     ip STRING COMMENT 'IP Address of the User')
COMMENT 'This is the page view table'
PARTITIONED BY(date STRING, pos STRING)
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS SEQUENCEFILE;
1.7 Creating a table with a partition column ds
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
1.8 Copying an empty table
CREATE TABLE empty_key_value_store
LIKE key_value_store;
Example
create table user_info (user_id int, cid string, ckid string, username string)
row format delimited fields terminated by '\t' lines terminated by '\n';
The data to import must match this format: fields separated by tabs, records separated by newlines. The file content must therefore look like:
100636  100890  c5c86f4cddc15eb7  yyyvybtvt
100612  100865  97cc70d411c18b6f  gyvcycy
100078  100087  ecd6026a15ffddf5  qa000100
1.9 Showing all tables:
hive> SHOW TABLES;
1.10 Showing tables that match a condition (regular expression):
hive> SHOW TABLES '.*s';
Adding and dropping partitions
Renaming a table
Changing a column's name, type, position, or comment
Adding/replacing columns
Adding table metadata
1.21 Adding a column to a table:
hive> ALTER TABLE pokes ADD COLUMNS
(new_col INT);
1.22 Ìí¼ÓÒ»Áв¢Ôö¼ÓÁÐ×Ö¶Î×¢ÊÍ
hive> ALTER TABLE invites ADD COLUMNS
(new_col2 INT COMMENT 'a comment');
1.23 Renaming a table:
hive> ALTER TABLE events RENAME TO
3koobecaf;
1.24 Dropping a table:
hive> DROP TABLE pokes;
1.25 Adding and dropping partitions
Adding:
ALTER TABLE table_name ADD [IF NOT EXISTS]
  partition_spec [ LOCATION 'location1' ]
  partition_spec [ LOCATION 'location2' ] ...

partition_spec:
  : PARTITION (partition_col = partition_col_value,
               partition_col = partition_col_value, ...)
Dropping:
ALTER TABLE table_name DROP partition_spec,
partition_spec,...
1.26 Renaming a table
ALTER TABLE table_name RENAME TO new_table_name
1.27 ÐÞ¸ÄÁеÄÃû×Ö¡¢ÀàÐÍ¡¢Î»Öá¢×¢ÊÍ£º
ALTER TABLE table_name CHANGE [COLUMN]
col_old_name col_new_name column_type [COMMENT col_comment]
[FIRST|AFTER column_name]
This command allows changing a column's name, data type, comment, position, or any combination of these.
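For example, a sketch against the invites table above (the new name visit_count and the position are illustrative); note that CHANGE COLUMN alters only the table metadata, not the data itself:
hive> ALTER TABLE invites CHANGE COLUMN new_col2 visit_count INT COMMENT 'renamed column' AFTER foo;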
1.28 Adding/replacing columns
ALTER TABLE table_name ADD|REPLACE COLUMNS
(col_name data_type [COMMENT col_comment], ...)
ADD adds new columns after the existing columns (but before the partition columns).
REPLACE replaces all of the table's columns.
1.29 Adding table metadata
ALTER TABLE table_name SET TBLPROPERTIES table_properties

table_properties:
  : (property_name = property_value, ...)
Users can use this command to add metadata to a table.
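A minimal sketch (the property name and value are arbitrary user metadata):
hive> ALTER TABLE pokes SET TBLPROPERTIES ('author' = 'etl_team');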
1.30 Changing a table's file format and organization
ALTER TABLE table_name SET FILEFORMAT file_format
ALTER TABLE table_name CLUSTERED BY(userid) SORTED BY(viewTime) INTO num_buckets BUCKETS
These commands change the table's physical storage properties.
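For example, a sketch that switches the pokes table to SequenceFile storage; this changes only the metadata, so existing data files are not rewritten:
hive> ALTER TABLE pokes SET FILEFORMAT SEQUENCEFILE;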
1.31 Creating/dropping views
CREATE VIEW [IF NOT EXISTS] view_name [(column_name [COMMENT column_comment], ...)]
  [COMMENT view_comment]
  [TBLPROPERTIES (property_name = property_value, ...)]
  AS SELECT ...
Creating a view:
If no column names are supplied, the names of the view's columns are derived automatically from the defining SELECT expression.
If the underlying table's definition changes afterwards, the change is not reflected in the view, and queries against an invalidated view will fail.
Views are read-only; they cannot be the target of LOAD/INSERT/ALTER.
Dropping a view:
DROP VIEW view_name
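A small sketch of both operations (the view name and filter are illustrative):
hive> CREATE VIEW positive_pokes COMMENT 'pokes with positive foo' AS SELECT foo, bar FROM pokes WHERE foo > 0;
hive> DROP VIEW positive_pokes;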
1.32 Creating a database
CREATE DATABASE name
1.33 Show commands
show tables;
show databases;
show partitions table_name;
show functions;
describe extended table_name[.col_name];
2. DML operations: loading data
Hive does not support inserting rows one at a time with INSERT statements, nor does it support UPDATE. Data is loaded into an already-created table with LOAD, and once imported it cannot be modified. (In traditional SQL, DML covers INSERT, UPDATE, and DELETE.)
Loading files into a table
Inserting query results into a Hive table
INSERT INTO (new in 0.8)
2.1.0 Loading files into a table
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE]
INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
Load ²Ù×÷Ö»Êǵ¥´¿µÄ¸´ÖÆ/ÒÆ¶¯²Ù×÷£¬½«Êý¾ÝÎļþÒÆ¶¯µ½ Hive ±í¶ÔÓ¦µÄλÖá£
filepath
Ïà¶Ô·¾¶£¬ÀýÈ磺project/data1
¾ø¶Ô·¾¶£¬ÀýÈ磺 /user/hive/project/data1
°üº¬Ä£Ê½µÄÍêÕû URI£¬ÀýÈ磺hdfs://namenode:9000/user/hive/project/data1
ÀýÈ磺
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt'
OVERWRITE INTO TABLE pokes;
2.1.1 Loading local data, specifying partition information
The load target can be a table or a partition. If the table is partitioned, a value must be specified for every partition column.
filepath can refer to a single file (in which case Hive moves the file into the table's directory) or to a directory (in which case Hive moves all files in that directory into the table's directory).
The LOCAL keyword
When LOCAL is specified, the load command looks for filepath in the local file system. A relative path is interpreted relative to the user's current working directory. The user may also give a complete URI for a local file, e.g. file:///user/hive/project/data1.
The load command then copies the files in filepath to the target file system, which is determined by the table's location attribute, and moves the copied data files into the table's data location.
For example, loading local data while specifying partition information:
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt'
OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
Without LOCAL
If filepath is a complete URI, Hive uses it directly. Otherwise:
If no scheme or authority is given, Hive uses the scheme and authority from the Hadoop configuration (fs.default.name, which specifies the Namenode URI).
If the path is not absolute, Hive interprets it relative to /user/.
Hive moves the content referred to by filepath into the path of the table (or partition).
2.1.2 Loading DFS data, specifying partition information:
hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
The above command will load data from an HDFS file/directory to the table.
Note that loading data from HDFS will result in moving the file/directory. As a result, the operation is almost instantaneous.
OVERWRITE
When OVERWRITE is specified, any existing content of the target table (or partition) is deleted first, and the content of the files/directory pointed to by filepath is then added to the table/partition.
If the target table (partition) already contains a file whose name collides with a file name in filepath, the existing file is replaced by the new one.
2.1.3 Inserting query results into a Hive table
Query results can be inserted into a Hive table or written out to the HDFS file system.
Basic mode:
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement
Multi-insert mode:
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ...] select_statement2] ...
Dynamic-partition mode:
INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...)
select_statement FROM from_statement
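Dynamic partitioning has to be switched on before the automatic mode can be used. A sketch (staging_invites is a hypothetical source table whose last selected column supplies the ds partition value):
hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> INSERT OVERWRITE TABLE invites PARTITION (ds) SELECT foo, bar, ds FROM staging_invites;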
2.1.4 Writing query results to the HDFS file system
When data is written out to the file system it is serialized as text, with each column delimited by ^A and each row terminated by \n.
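For example, a sketch that writes one partition of the invites table out to an HDFS directory in this ^A-delimited text form:
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.foo, a.bar FROM invites a WHERE a.ds='2008-08-15';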
2.1.5 INSERT INTO
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement
3. DQL operations: querying data
Basic SELECT operations
Partition-based queries
Joins
3.1 Basic SELECT operations
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ CLUSTER BY col_list
  | [DISTRIBUTE BY col_list] [SORT BY | ORDER BY col_list]
]
[LIMIT number]
The ALL and DISTINCT options control how duplicate records are handled. The default is ALL, which returns all records; DISTINCT removes duplicates.
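For example, against the pokes table created earlier:
hive> SELECT DISTINCT bar FROM pokes;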
WHERE conditions
These work like the WHERE clause in traditional SQL.
AND and OR are currently supported; BETWEEN is supported from version 0.9.
IN and NOT IN are supported.
EXISTS and NOT EXISTS are not supported.
The difference between ORDER BY and SORT BY
ORDER BY sorts globally, using a single reduce task.
SORT BY sorts only within each reducer, so the overall output is not totally ordered.
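A sketch of the contrast, using the illustrative test table from the Top-k example below:
SELECT * FROM test ORDER BY amount DESC;  -- one reduce task, totally ordered output
SELECT * FROM test SORT BY amount DESC;   -- ordered within each reducer only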
Limit
LIMIT restricts the number of records returned:
SELECT * FROM t1 LIMIT 5
Implementing a Top-k query
The following query retrieves the 5 sales representatives with the largest sales records:
SET mapred.reduce.tasks = 1
SELECT * FROM test SORT BY amount DESC
LIMIT 5
REGEX column specification
A SELECT statement can use a regular expression to select columns. The following query returns all columns except ds and hr:
SELECT `(ds|hr)?+.+` FROM test
For example
Querying by condition:
hive> SELECT a.foo FROM invites a WHERE a.ds='<DATE>';
Writing query output to an HDFS directory:
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out'
      SELECT a.* FROM invites a WHERE a.ds='<DATE>';
Writing query results to a local directory:
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out'
      SELECT a.* FROM pokes a;
Ñ¡ÔñËùÓÐÁе½±¾µØÄ¿Â¼£º
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a; hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100; hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a; hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_4' select a.invites, a.pokes FROM profiles a; hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(1) FROM invites a WHERE a.ds='<DATE>'; hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a; hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a; |
Inserting one table's aggregated results into another table:
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
JOIN:
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar)
      INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
Inserting one table's data into multiple destinations (multi-insert):
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12')
    SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
Streaming data through an external script:
hive> FROM invites a INSERT OVERWRITE TABLE events
      SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
This streams the data in the map phase through the script /bin/cat (like hadoop streaming).
Similarly, streaming can be used on the reduce side (please see the Hive Tutorial for examples).
3.2 Partition-based queries
An ordinary SELECT scans the entire table. If the table was created with a PARTITIONED BY clause, queries can take advantage of partition pruning (input pruning).
In Hive's current implementation, partition pruning is applied only if the partition predicates appear in the WHERE clause closest to the FROM clause.
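For example, a sketch against the partitioned invites table from section 1.7; because the predicate on the partition column ds sits in the WHERE clause next to the FROM, only that partition's files are read:
hive> SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';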
3.3 Join
join_table:
    table_reference JOIN table_factor [join_condition]
  | table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
  | table_reference LEFT SEMI JOIN table_reference join_condition

table_reference:
    table_factor
  | join_table

table_factor:
    tbl_name [alias]
  | table_subquery alias
  | ( table_references )

join_condition:
    ON equality_expression ( AND equality_expression )*

equality_expression:
    expression = expression
Hive supports only equality joins, outer joins, and left semi joins. Hive does not support non-equality join conditions, because they are very hard to translate into map/reduce jobs.
The LEFT, RIGHT and FULL OUTER keywords handle rows that have no match on the other side of the join.
LEFT SEMI JOIN is a more efficient implementation of the IN/EXISTS subquery pattern.
In each map/reduce stage of a join, the reducer buffers the records of every table in the join sequence except the last one, and streams the results to the file system via the last table.
In practice, the largest table should therefore be written last.
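A sketch of this rule of thumb (small_table and big_table are illustrative names):
SELECT s.val, b.val FROM small_table s JOIN big_table b ON (s.key = b.key);
Here big_table comes last, so its records are streamed through the reducers rather than buffered.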
Several key points to note in join queries:
Only equality joins are supported:
SELECT a.* FROM a JOIN b ON (a.id = b.id)
SELECT a.* FROM a JOIN b ON (a.id = b.id AND a.department = b.department)
More than two tables can be joined, for example:
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1)
JOIN c ON (c.key = b.key2)
Èç¹ûjoinÖжà¸ö±íµÄ join key ÊÇͬһ¸ö£¬Ôò join »á±»×ª»¯Îªµ¥¸ö
map/reduce ÈÎÎñ
LEFT, RIGHT and FULL OUTER
Example:
SELECT a.val, b.val FROM a LEFT OUTER JOIN b ON (a.key=b.key)
If you want to restrict a join's output, the filter condition can be written either in the WHERE clause or in the JOIN clause. A situation that is easy to get wrong is when the joined tables are partitioned:
SELECT c.val, d.val FROM c LEFT OUTER JOIN d ON (c.key=d.key)
WHERE c.ds='2010-07-07' AND d.ds='2010-07-07'
If d contains no record matching a record in c, all of d's columns come back as NULL, including the ds column; the WHERE clause then filters out every row for which d had no match on the join key. In other words, the WHERE condition defeats the intended LEFT OUTER semantics.
The solution is to move the partition predicates into the ON clause:
SELECT c.val, d.val FROM c LEFT OUTER JOIN d
ON (c.key=d.key AND d.ds='2009-07-07' AND c.ds='2009-07-07')
LEFT SEMI JOIN
The restriction on LEFT SEMI JOIN is that the right-hand table may only be referenced in the ON clause; filtering it in the WHERE clause, the SELECT clause, or anywhere else is not allowed.
SELECT a.key, a.value
FROM a
WHERE a.key in
  (SELECT b.key
   FROM b);
can be rewritten as:
SELECT a.key, a.val
FROM a LEFT SEMI JOIN b ON (a.key = b.key)
UNION ALL
UNION ALL merges the results of multiple SELECT queries; the SELECT lists must agree in the columns they produce.
select_statement UNION ALL select_statement
UNION ALL select_statement ...
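A sketch combining two of the example tables above (older Hive versions require the UNION ALL to sit inside a subquery, as here):
SELECT u.foo, u.bar
FROM (SELECT foo, bar FROM pokes
      UNION ALL
      SELECT foo, bar FROM invites) u;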
4. Habits to change when moving from SQL to HiveQL
4.1 Hive does not support implicit (comma-separated) joins
In SQL, an inner join of two tables can be written as:
select * from dual a, dual b where a.key = b.key;
In Hive this must be written as:
select * from dual a join dual b on a.key = b.key;
and not in the traditional form:
SELECT t1.a1 as c1, t2.b1 as c2 FROM t1, t2 WHERE t1.a2 = t2.b2
4.2 The semicolon character
The semicolon is the statement terminator in SQL, and in HiveQL as well, but HiveQL's handling of semicolons is less clever. For example:
select concat(key,concat(';',key)) from dual;
HiveQL fails while parsing this statement:
FAILED: Parse Error: line 0:-1 mismatched input '<EOF>' expecting ) in function specification
The workaround is to escape the semicolon with its octal ASCII code, rewriting the statement as:
select concat(key,concat('\073',key)) from dual;
4.3 IS [NOT] NULL
In SQL, NULL represents a missing value. Be warned that in HiveQL, if a STRING column holds an empty string (length 0), IS NULL evaluates to False for it.
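A sketch of a check that catches both cases, using the user_info table defined earlier:
select * from user_info where username is null or username = '';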
4.4 Hive does not support inserting into an existing table or partition
Only overwriting an entire table (or partition) is supported, for example:
INSERT OVERWRITE TABLE t1
SELECT * FROM t2;
4.5 Hive does not support INSERT INTO, UPDATE, or DELETE operations
As a consequence, no complicated locking mechanism is needed for reading and writing data.
INSERT INTO syntax is only available starting in version 0.8; INSERT INTO appends data to a table or partition.
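A sketch of the append form, reusing the illustrative t1/t2 names from above:
INSERT INTO TABLE t1 SELECT * FROM t2;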
4.6 Hive supports embedding map/reduce programs to handle complex logic
For example:
FROM (
  MAP doctext USING 'python wc_mapper.py' AS (word, cnt)
  FROM docs
  CLUSTER BY word
) a
REDUCE word, cnt USING 'python wc_reduce.py';
-- doctext: the input column
-- word, cnt: the output of the map program
-- CLUSTER BY: hashes on word and feeds the rows to the reduce program as its input
The map program and the reduce program can also be used on their own, for example:
FROM (
  FROM session_table
  SELECT sessionid, tstamp, data
  DISTRIBUTE BY sessionid SORT BY tstamp
) a
REDUCE sessionid, tstamp, data USING 'session_reducer.sh';
-- DISTRIBUTE BY: distributes rows to the reduce program
4.7 Hive can write transformed data directly into multiple tables, and also into partitions, HDFS, and local directories
This avoids the cost of scanning the input table multiple times.
FROM t1
INSERT OVERWRITE TABLE t2
  SELECT t1.c2, count(1)
  WHERE t1.c1 <= 20
  GROUP BY t1.c2
INSERT OVERWRITE DIRECTORY '/output_dir'
  SELECT t1.c2, avg(t1.c1)
  WHERE t1.c1 > 20 AND t1.c1 <= 30
  GROUP BY t1.c2
INSERT OVERWRITE LOCAL DIRECTORY '/home/dir'
  SELECT t1.c2, sum(t1.c1)
  WHERE t1.c1 > 30
  GROUP BY t1.c2;
5. A complete example
5.1 Create a table
CREATE TABLE u_data (
  userid INT,
  movieid INT,
  rating INT,
  unixtime STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
Download the sample data file and unpack it:
wget http://www.grouplens.org/system/files/ml-data.tar__0.gz
tar xvzf ml-data.tar__0.gz
5.2 Load the data into the table:
LOAD DATA LOCAL INPATH 'ml-data/u.data' OVERWRITE INTO TABLE u_data;
5.3 Count the total number of records:
SELECT COUNT(1) FROM u_data;
5.4 Now for some more complex data analysis:
Create a file weekday_mapper.py that will split the data by weekday:
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
5.5 Generate the weekday for each record (continuing inside the loop):
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([userid, movieid, rating, str(weekday)])
5.6 Use the mapping script
-- Create a table, splitting each row's field values on the delimiter
CREATE TABLE u_data_new (
  userid INT,
  movieid INT,
  rating INT,
  weekday INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
-- Make the python file available to Hive
add FILE weekday_mapper.py;
5.7 ½«Êý¾Ý°´ÖܽøÐзָî
INSERT OVERWRITE TABLE u_data_new SELECT TRANSFORM (userid, movieid, rating, unixtime) USING 'python weekday_mapper.py' AS (userid, movieid, rating, weekday) FROM u_data;
SELECT weekday, COUNT(1)
FROM u_data_new
GROUP BY weekday;
|
Processing Apache weblog data
First use a regular expression to break each log line into fields, then load the fields that match into the table:
add jar ../build/contrib/hive_contrib.jar;
CREATE TABLE apachelog (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;