Postgres XL中FQS与Subquery性能比较

本篇基于Postgres XL9.5R1.6+修复部分bug。

Postgres XL提供了一个叫做Fast Query Shipping(FQS)的技术。当语句可以被直接传递给一个或多个datanode执行,并且仅在coordinator合并datanode的返回结果时,PGXL会用FQS。否则用常规的Subquery,在coordinator上生成执行计划,将计划传递给一个或多个datanode。两者在不同场景下的性能差异较大。

准备

初始化pgbench数据,此处目的是为了使用pgbench_accounts表,

pgbench -h 172.16.0.208 -p 21001 -U postgres -i -s 100

创建pgbench_accounts_msg表,用于与pgbench_accounts表关联,测试表关联语句,

CREATE TABLE pgbench_accounts_msg (l_aid integer, r_aid integer, ctime timestamp default now()) DISTRIBUTE BY HASH(l_aid);
CREATE INDEX idx_pgbench_accounts_msg_1 ON pgbench_accounts_msg(l_aid);
CREATE INDEX idx_pgbench_accounts_msg_2 ON pgbench_accounts_msg(r_aid);

创建函数,

-- 更新
CREATE OR REPLACE FUNCTION func_update(p_aid integer, p_abalance integer, opt integer) RETURNS VOID AS $$
BEGIN
    IF opt = 1 THEN
        EXECUTE 'UPDATE pgbench_accounts SET abalance = abalance + ' || p_abalance || ' WHERE aid = ' || p_aid;
    ELSIF opt = 2 THEN
        EXECUTE 'UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = ' || p_aid USING p_abalance;
    ELSIF opt = 3 THEN
        UPDATE pgbench_accounts SET abalance = abalance + p_abalance WHERE aid = p_aid;
    ELSIF opt = 4 THEN
        EXECUTE 'UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = $2' USING p_abalance, p_aid;
    ELSIF opt = 5 THEN
        UPDATE pgbench_accounts SET abalance = (SELECT abalance + p_abalance) WHERE aid = p_aid;
    ELSIF opt = 6 THEN
        SET enable_fast_query_shipping = off;
        UPDATE pgbench_accounts SET abalance = abalance + p_abalance WHERE aid = p_aid;
        SET enable_fast_query_shipping = on;
    END IF;
END;
$$ LANGUAGE plpgsql;

-- 插入
CREATE OR REPLACE FUNCTION func_insert(p_aid integer, p_aid2 integer, opt integer) RETURNS VOID AS $$
DECLARE
    tms timestamp;
BEGIN
    IF opt = 1 THEN
        tms := now();
        INSERT INTO pgbench_accounts_msg VALUES (p_aid, p_aid2, tms);
    ELSIF opt = 2 THEN
        INSERT INTO pgbench_accounts_msg VALUES (p_aid, p_aid2, now());
        --INSERT INTO pgbench_accounts_msg VALUES (p_aid, p_aid2);
    ELSIF opt = 3 THEN
        INSERT INTO pgbench_accounts (aid) VALUES (p_aid) ON CONFLICT DO NOTHING;
    ELSIF opt = 4 THEN
        INSERT INTO pgbench_accounts (aid) VALUES (p_aid) ON CONFLICT(aid) DO UPDATE SET abalance = EXCLUDED.abalance + 1;
    END IF;
END;
$$ LANGUAGE plpgsql;

-- 查询
CREATE OR REPLACE FUNCTION func_select(p_aid integer, opt integer) RETURNS TABLE (r_aid integer, r_bid integer, r_abalance integer) AS $$
DECLARE
    ui_row  pgbench_accounts%ROWTYPE;
BEGIN
    IF opt = 1 THEN
        EXECUTE 'SELECT * FROM pgbench_accounts WHERE aid = ' || p_aid INTO ui_row;
    ELSIF opt = 2 THEN
        SELECT * INTO ui_row FROM pgbench_accounts WHERE aid = p_aid;
    ELSIF opt = 3 THEN
        EXECUTE 'SELECT * FROM pgbench_accounts WHERE aid = $1' INTO ui_row USING p_aid;
    ELSIF opt = 4 THEN
        EXECUTE 'SELECT * FROM (SELECT * FROM pgbench_accounts WHERE aid = $1) a' INTO ui_row USING p_aid;
    ELSIF opt = 5 THEN
        SELECT * INTO ui_row FROM (SELECT * FROM pgbench_accounts WHERE aid = p_aid) a;
    ELSIF opt = 6 THEN
        SET enable_fast_query_shipping = off;
        SELECT * INTO ui_row FROM pgbench_accounts WHERE aid = p_aid;
        SET enable_fast_query_shipping = on;
    END IF;
    RETURN QUERY SELECT ui_row.aid, ui_row.bid, ui_row.abalance;
END;
$$ LANGUAGE plpgsql;

-- 联表查询
CREATE OR REPLACE FUNCTION func_select_join(p_aid integer, opt integer) RETURNS TABLE (r_aid integer, r_bid integer, r_abalance integer) AS $$
DECLARE
    ui_row  pgbench_accounts%ROWTYPE;
BEGIN
    IF opt = 1 THEN
        SELECT a.* INTO ui_row FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = p_aid;
    ELSIF opt = 2 THEN
        EXECUTE 'SELECT * FROM (SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = ' || p_aid || ') t' INTO ui_row;
    ELSIF opt = 3 THEN
        EXECUTE 'SELECT * FROM (SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = $1) t' INTO ui_row USING p_aid;
    ELSIF opt = 11 THEN
        EXECUTE 'SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.r_aid WHERE a.aid = $1' INTO ui_row USING p_aid;
    ELSIF opt = 12 THEN
        SELECT a.* INTO ui_row FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.r_aid WHERE a.aid = aid;
    END IF;
    RETURN QUERY SELECT ui_row.aid, ui_row.bid, ui_row.abalance;
END;
$$ LANGUAGE plpgsql;

-- 删除
CREATE OR REPLACE FUNCTION func_delete(p_aid integer, opt integer) RETURNS VOID AS $$
BEGIN
    IF opt = 1 THEN
        EXECUTE 'DELETE FROM pgbench_accounts_msg WHERE l_aid = ' || p_aid;
    ELSIF opt = 2 THEN
        DELETE FROM pgbench_accounts_msg WHERE l_aid = p_aid;
    ELSIF opt = 3 THEN
        SET enable_fast_query_shipping = off;
        DELETE FROM pgbench_accounts_msg WHERE l_aid = p_aid;
        SET enable_fast_query_shipping = on;
    END IF;
END;
$$ LANGUAGE plpgsql;

pgbench压测命令,

pgbench -p 21001 -h 172.16.0.208 -U postgres -n -r -P 1 -f script.sql -c 64 -j 64 -T 120

pgbench压测脚本script.sql,

\set scale 100
\set nbranches 1 * :scale
\set ntellers 10 * :scale
\set naccounts 100000 * :scale
\setrandom aid 1 :naccounts
\setrandom aid2 1 :naccounts
\setrandom bid 1 :nbranches
\setrandom tid 1 :ntellers
\setrandom delta -5000 5000

--UPDATE pgbench_accounts SET abalance = abalance + :delta WHERE aid = :aid;
--UPDATE pgbench_accounts SET abalance = (select abalance + :delta) WHERE aid = :aid;
--select * from func_update(:aid, :delta, 6);

--insert into pgbench_accounts_msg values (:aid, :aid + :delta, '2018-04-03 06:35:51.766286+00');
--insert into pgbench_accounts_msg values (:aid, :aid + :delta, now());
--select * from func_insert(:aid, :aid2, 4);

--select * from pgbench_accounts where aid = :aid;
--SELECT * FROM (SELECT * FROM pgbench_accounts WHERE aid = :aid) a;
--select * from func_select(:aid, 6);

--select * from pgbench_accounts a left join pgbench_accounts_msg b on a.aid = b.l_aid where a.aid = :aid;
--select * from (select * from pgbench_accounts a left join pgbench_accounts_msg b on a.aid = b.l_aid where a.aid = :aid) t;
--select a.* from pgbench_accounts a left join pgbench_accounts_msg b on a.aid = b.r_aid where a.aid = :aid;
--select * from func_select_join(:aid, 12);

--delete from pgbench_accounts_msg where l_aid = :aid;
--select * from func_delete(:aid, 3);

执行计划解析

常规SQL使用explain verbose即可得到执行计划。对于函数,使用auto_explain插件把函数内SQL语句的执行计划输出到日志后查看。通过执行计划可以知道语句的执行涉及到哪些datanode,以哪种方式(FQS或Remote Subquery)执行。

压测结果

在以下测试中,pgbench_accounts_msg表的数据量是pgbench_accounts的1.2倍。【plan】里RS的意思是Remote Subquery。【dn数】里all的意思是语句或执行计划被发送给所有datanode(即distribute by hash的表所在的那些datanode),one的意思是规划器通过分析语句计算得到数据所在的那个datanode,然后只去那个datanode执行。【TPS】是pgbench压测2分钟得到的结果,每秒完成事务数。【负载】是linux里的load average。【网络IO】是通过nload计算得到的网络输入和网络输出的平均值,单位MBit/s。

UPDATE … SET … WHERE [distribute key] = …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
UPDATE pgbench_accounts SET abalance = abalance + p_abalance WHERE aid = :aid; 常规 FQS one 14500 1.5 10 20/20 85/89
UPDATE pgbench_accounts SET abalance = (select abalance + :delta) WHERE aid = :aid; 常规 RS one 9500 1.5 14 119/21 91/475
EXECUTE ‘UPDATE pgbench_accounts SET abalance = abalance + ‘ || p_abalance || ‘ WHERE aid = ‘ || p_aid; 函数内 FQS one 10600 1.5 12 14/14 61/67
EXECUTE ‘UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = ‘ || p_aid USING p_abalance; 函数内 FQS one 9200 1.5 13 12/14 56/62
UPDATE pgbench_accounts SET abalance = abalance + p_abalance WHERE aid = p_aid; 函数内 FQS all 4000 5 8 53/61 191/208
EXECUTE ‘UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = $2’ USING p_abalance, p_aid; 函数内 FQS all 3300 3.5 10 47/45 168/183
UPDATE pgbench_accounts SET abalance = (select abalance + p_abalance) WHERE aid = p_aid; 函数内 RS one 3200 3 12 188/50 175/756
SET enable_fast_query_shipping = off;
UPDATE pgbench_accounts SET abalance = abalance + p_abalance WHERE aid = p_aid;
SET enable_fast_query_shipping = on;
函数内 RS one 3000 3 10 166/54 175/640

INSERT INTO … VALUES …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
INSERT INTO pgbench_accounts_msg VALUES (:aid, :aid + :delta, ‘2018-04-03 06:35:51.766286+00’); 常规 FQS one 15500 1.5 12 21/31 112/95
INSERT INTO pgbench_accounts_msg VALUES (:aid, :aid + :delta, now()); 常规 RS one 11500 1.5 12 75/31 112/302
tms := now();
INSERT INTO pgbench_accounts_msg VALUES (p_aid, p_aid2, tms);
函数内 FQS one 11100 1 12 17/23 69/85
INSERT INTO pgbench_accounts_msg VALUES (p_aid, p_aid + p_abalance, now()); 函数内 RS one 2900 7 11 212/121 157/526
INSERT INTO pgbench_accounts (aid) VALUES (p_aid) ON CONFLICT DO NOTHING; 函数内 RS one 2700 7 9 207/156 202/582
INSERT INTO pgbench_accounts (aid) VALUES (p_aid) ON CONFLICT(aid) DO UPDATE SET abalance = EXCLUDED.abalance + 1; 函数内 RS one 2500 7 11 260/136 167/692

SELECT … FROM … WHERE [distribute key] = …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
SELECT * FROM pgbench_accounts WHERE aid = :aid; 常规 FQS one 21300 1.5 16 19/28 153/135
SELECT * FROM (SELECT * FROM pgbench_accounts WHERE aid = :aid) a; 常规 RS one 12900 1 16 110/26 123/469
EXECUTE ‘SELECT * FROM pgbench_accounts WHERE aid = ‘ || p_aid INTO ui_row; 函数内 FQS one 11300 0.6 13 10/13 74/70
SELECT * INTO ui_row FROM pgbench_accounts WHERE aid = p_aid; 函数内 FQS all 8700 3 14 36/36 166/171
EXECUTE ‘SELECT * FROM pgbench_accounts WHERE aid = $1’ INTO ui_row USING p_aid; 函数内 FQS all 7200 3 14 29/29 136/140
EXECUTE ‘SELECT * FROM (SELECT * FROM pgbench_accounts WHERE aid = $1) a’ INTO ui_row USING p_aid; 函数内 RS one 6800 0.6 14 59/13 69/250
SELECT * INTO ui_row FROM (SELECT * FROM pgbench_accounts WHERE aid = p_aid) a; 函数内 RS one 4700 2.5 12 191/34 147/761
SET enable_fast_query_shipping = off;
SELECT * INTO ui_row FROM pgbench_accounts WHERE aid = p_aid;
SET enable_fast_query_shipping = on;
函数内 RS one 2800 3 9 137/44 166/531

# SELECT … FROM … LEFT JOIN … ON [distribute key] WHERE [distribute key] = …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
SELECT * FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = :aid; 常规 FQS all 11000 4.5 11 48/55 245/242
SELECT * FROM (SELECT * FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = :aid) t; 常规 RS one 9400 1 18 179/22 108/738
SELECT a.* INTO ui_row FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = p_aid; 函数内 FQS all 8800 4.5 11 40/36 165/184
EXECUTE ‘SELECT * FROM (SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = ‘ || p_aid || ‘) t’ INTO ui_row; 函数内 RS one 6700 0.6 14 104/14 72/426
EXECUTE ‘SELECT * FROM (SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.l_aid WHERE a.aid = $1) t’ INTO ui_row USING p_aid; 函数内 RS one 6200 0.5 12 98/13 68/399

SELECT … FROM … LEFT JOIN … ON [non-distribute key] WHERE [distribute key] = …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.r_aid WHERE a.aid = :aid; 常规 RS all 3200 5 9 322/110 108/738
EXECUTE ‘SELECT a.* FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.r_aid WHERE a.aid = $1’ INTO ui_row USING p_aid; 函数内 RS all 2700 3 9 262/90 91/751
SELECT a.* INTO ui_row FROM pgbench_accounts a LEFT JOIN pgbench_accounts_msg b ON a.aid = b.r_aid WHERE a.aid = aid; 函数内 RS all crash

DELETE FROM … WHERE [distribute key] = …

【SQL】 【调用方式】 【plan】 【dn数】 【TPS】 【负载】 【网络IO】
【dn】 【cn】 【dn】 【cn】
DELETE FROM pgbench_accounts_msg WHERE l_aid = :aid; 常规 FQS one 16800 1 11 18/21 85/91
EXECUTE ‘DELETE FROM pgbench_accounts_msg WHERE l_aid = ‘ || p_aid; 函数内 FQS one 10900 1 11 12/14 60/66
DELETE FROM pgbench_accounts_msg WHERE l_aid = p_aid; 函数内 FQS all 4000 4 9 55/70 195/212
SET enable_fast_query_shipping = off;
DELETE FROM pgbench_accounts_msg WHERE l_aid = p_aid;
SET enable_fast_query_shipping = on;
函数内 RS one 3000 3.5 9 157/60 192/606

结论

理论上FQS + one是最经济高效的,但是当前版本的FQS存在缺陷,不是所有看似可以FQS + one的语句都会这么执行。

直接调用以“[distribute key] = ”作为where条件的常规update、delete、select语句,都是采取FQS + one的执行方式。当把这类语句放在plpgsql函数内调用,或使用prepare statement时,则采取FQS + all的执行方式。因为FQS还不能从运行时变量中分析得出数据所在的datanode,所以只能把语句发给所有datanode执行。这种all的策略是需要尽量避免的,它不仅使语句容易因无关datanode的故障而失败,而且造成数倍的资源浪费,使集群丧失了伸缩性。为了使这类语句在plpgsql函数内的调用也采取FQS + one的方式,可以拼接distribute key构造动态SQL,通过EXECUTE执行。这是一种折中策略,但是在当前版本下推荐使用。

常规的insert语句,无论是直接调用还是其他方式调用,都采取FQS + one的方式。

不同的distribute by hash的表,如果他们的distribute key类型相同,且数据分布的datanode完全一样,则他们之间相同key值的数据在同一个datanode上。这是与程序的设计和实现相关的,是一个潜规则,准确的做法是用key值去分别判断每一个表的数据所在的datanode是哪个,然后再看他们是否是同一个datanode。由这个规律可看出,在使用distribute key做联表查询,且以“[distribute key] = ”作为where条件时,可以预知是否只需要某一个datanode参与就可以了。但是当前版本的FQS还做不到,这类联表查询采取的是FQS + all的方式。对于这类语句的优化,可以通过临时关闭FQS迫使其采取RS + one的方式执行,虽然效率可能略低于FQS + all。

FQS在选择datanode时,只有one和all,不存在partial。对于以“[distribute key] in ”作为where条件语句,基本上都是采取FQS + all的执行方式。而不会根据值的具体情况做出只去部分datanode执行的判断。

FQS除了在one和all之间的选择上存在不足外,还只适用于非常简单的SQL,稍微复杂一点的都还未支持。例如,

  • SQL中用到了函数。包括select返回值、where条件里、insert的value值里、update的set值里用到类似max()、now()等任何函数。
  • 表中字段缺省值为函数,且语句执行会用到。比如表pgbench_accounts_msg定义了字段ctime timestamp default now(),而insert时没有给ctime赋值
  • 使用了子查询。
  • 使用了on conflict、returning等关键字。

以上情况将采取常规的Remote Subquery方式执行。总体上Remote Subquery的性能低于FQS,并且在coordinator到datanode的网络传输上对带宽消耗严重,普通千兆网较难承受,所以在SQL的写法上要仔细权衡。

Creative Commons License

本文基于署名-非商业性使用-相同方式共享 4.0许可协议发布,欢迎转载、使用、重新发布,但请保留文章署名wanghengbin(包含链接:https://wanghengbin.com),不得用于商业目的,基于本文修改后的作品请以相同的许可发布。

发表评论