本文在上一篇的基础上测试数据的导入。
- 在
slave1
机器上准备一份数据,格式如timestamp,value,tags
:
# pwd
/home/gpadmin/ext_files
# head -5 data.csv
1446559211,27,2395
1446559226,24,2395
1446559241,28,2395
1446559256,25,2395
1446559271,33,2395
# wc -l data.csv
1590421 data.csv
slave1
启动gpfdist
服务:
gpfdist -d /home/gpadmin/ext_files -p 8081 -l /tmp/gpfdist.8081.log
- 从
master
进入greenplum
su - gpadmin
psql -d postgres
- 创建测试数据库
postgres=# CREATE DATABASE exampledb OWNER gpadmin;
CREATE DATABASE
postgres=# \l
List of databases
Name | Owner | Encoding | Access privileges
-----------+---------+----------+---------------------
exampledb | gpadmin | UTF8 |
postgres | gpadmin | UTF8 |
template0 | gpadmin | UTF8 | =c/gpadmin
: gpadmin=CTc/gpadmin
template1 | gpadmin | UTF8 | =c/gpadmin
: gpadmin=CTc/gpadmin
(4 rows)
- 进入测试数据库
psql -d exampledb
- 创建外部表
exampledb=# CREATE EXTERNAL TABLE data_ext ( timestamp bigint, value bigint, sid int ) location ( 'gpfdist://slave1:8081/data.csv' ) FORMAT 'CSV' ( DELIMITER ',' );
CREATE EXTERNAL TABLE
- 创建表
exampledb=# CREATE TABLE data ( timestamp bigint, value bigint, sid int ) DISTRIBUTED RANDOMLY;
CREATE TABLE
- 数据导入
exampledb=# INSERT INTO cpu SELECT * FROM cpu_ext;
INSERT 0 1590421
可以看到,插入的行数是吻合的。
- 测试查询
exampledb=# \timing
Timing is on.
exampledb=# SELECT COUNT(1) FROM data;
count
---------
1590421
(1 row)
Time: 797.723 ms
exampledb=# SELECT sid, MAX(value) from data GROUP BY sid ORDER BY 2 DESC LIMIT 10;
sid | max
------+-----
5293 | 841
7086 | 820
7058 | 798
5272 | 783
7077 | 777
7509 | 775
5278 | 759
7504 | 748
7060 | 734
7505 | 726
(10 rows)
Time: 982.166 ms