这条SQL DuckDB只花了1秒,甩开好友20倍+

2024年 1月 2日 97.3k 0

PostgreSQL, MySQL, DuckDB, DuckDB+mysqlscanner 1000万记录的简单分析SQL对比

标签

PostgreSQL , DuckDB , duckdb mysqlscanner , MySQL , 分析SQL , 性能对比,非内存照样秒见:20220901_05

背景

DuckDB发布了MySQL scanner, 支持在DuckDB内直接读写mysql table, 本文对比了PostgreSQL, MySQL, DuckDB, DuckDB+mysqlscanner 1000万记录的简单分析SQL. 结果当然是DuckDB完胜. 但是DuckDB+mysqlscanner给了MySQL一个不错的选择, 0成本加速MySQL分析.

本例使用以下docker images测试:

x86_64

# 拉取镜像, 第一次拉取一次即可. 或者需要的时候执行, 将更新到最新镜像版本.
docker pull registry.cn-hangzhou.aliyuncs.com/digoal/opensource_database:pg14_with_exts

# 启动容器
docker run --platform linux/amd64 -d -it -P --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --privileged=true --name pg --shm-size=1g registry.cn-hangzhou.aliyuncs.com/digoal/opensource_database:pg14_with_exts

# 进入容器
docker exec -ti pg bash

arm64

# 拉取镜像, 第一次拉取一次即可. 或者需要的时候执行, 将更新到最新镜像版本.
docker pull registry.cn-hangzhou.aliyuncs.com/digoal/opensource_database:pg14_with_exts_arm64

# 启动容器
docker run -d -it -P --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --privileged=true --name pg --shm-size=1g registry.cn-hangzhou.aliyuncs.com/digoal/opensource_database:pg14_with_exts_arm64

# 进入容器
docker exec -ti pg bash

例子

首先在容器中部署mysql8最新版本, 下次把mysql直接打包到容器中.

https://dev.mysql.com/downloads/repo/apt/

cd tmp

wget https://repo.mysql.com//mysql-apt-config_0.8.28-1_all.deb
dpkg -i mysql-apt-config_0.8.28-1_all.deb
apt-get update
apt-get install -y mysql-server

root@502e0ed09ea5:/tmp# mysql --version
mysql Ver 8.0.35 for Linux on x86_64 (MySQL Community Server - GPL)

配置my.cnf

vi etc/mysql/my.cnf

# add by digoal
[mysqld]
secure_file_priv = tmp
socket = tmp/mysql.sock

启动MySQL

root@502e0ed09ea5:~# mysqld -u root -D
mysqld will log errors to var/log/mysql/error.log
mysqld is running as pid 1721

在PostgreSQL中测试1000万数据简单分析SQL, 21.4秒.

root@502e0ed09ea5:~# psql
psql (14.10 (Debian 14.10-1.pgdg110+1))
Type "help" for help.

postgres=# \timing
Timing is on.
postgres=# create unlogged table tbl (id int, info text, ts timestamp, c1 int);
CREATE TABLE
Time: 51.401 ms
postgres=# insert into tbl select id,md5(random()::text) as info,clock_timestamp()::timestamp(0) as ts, (random()*10000)::int as c1 from generate_series(1,10000000) id;
INSERT 0 10000000
Time: 15152.684 ms (00:15.153)
postgres=# show max_parallel_workers_per_gather ;
max_parallel_workers_per_gather
---------------------------------
0
(1 row)

Time: 0.506 ms
postgres=# select count(*), count(distinct id), count(distinct info), count(distinct ts), count(distinct c1) from tbl;

count | count | count | count | count
----------+----------+----------+-------+-------
10000000 | 10000000 | 10000000 | 15 | 10001
(1 row)
Time: 21414.875 ms (00:21.415)

使用PostgreSQL导出csv, 以便写入MySQL

psql -c "copy (select id,md5(random()::text) as info,clock_timestamp()::timestamp(0) as ts, (random()*10000)::int as c1 from generate_series(1,10000000) id) to '/tmp/1.csv' with (format csv, header on)"

COPY 10000000

将数据导入MySQL, 直接使用mysql 1000万记录简单分析SQL 61秒.

mysql -S tmp/mysql.sock

mysql> mysql
mysql> create table tbl (id int(10), info varchar(100), ts timestamp, c1 int(10));
mysql> load data infile '/tmp/1.csv' into table tbl fields terminated by ',' ignore 1 lines;

mysql> CREATE R 'digoal'@'localhost' IDENTIFIED BY '123456';
mysql> grant all on *.* to 'digoal'@'localhost';
mysql> grant all on mysql to 'digoal'@'localhost';

mysql> select count(*), count(distinct id), count(distinct info), count(distinct ts), count(distinct c1) from tbl;
+----------+--------------------+----------------------+--------------------+--------------------+
| count(*) | count(distinct id) | count(distinct info) | count(distinct ts) | count(distinct c1) |
+----------+--------------------+----------------------+--------------------+--------------------+
| 10000000 | 10000000 | 10000000 | 18 | 10001 |
+----------+--------------------+----------------------+--------------------+--------------------+
1 row in set (1 min 0.97 sec)

mysql> select * from tbl limit 10;
+------+----------------------------------+---------------------+------+
| id | info | ts | c1 |
+------+----------------------------------+---------------------+------+
| 1 | 2a9758cafc8fb25c1d960e770b7a8463 | 2023-12-02 01:50:23 | 518 |
| 2 | 02145c7be91aadae3bcc868fea3198d5 | 2023-12-02 01:50:23 | 9623 |
| 3 | 98c1ea502f3f58c900ccc88886a09515 | 2023-12-02 01:50:23 | 2143 |
| 4 | 413bb65f65606b3bf9b10ac03336f407 | 2023-12-02 01:50:23 | 3321 |
| 5 | d8fd444cfbcd82c3e2b6e5622ee0e439 | 2023-12-02 01:50:23 | 3750 |
| 6 | ac8709aacb3dca453e88d09dca58b83b | 2023-12-02 01:50:23 | 616 |
| 7 | 7f55f7d483a7b86e68c1f9fff7ab68a5 | 2023-12-02 01:50:23 | 1976 |
| 8 | b2ec10593c1c780c83c3801bf306d12f | 2023-12-02 01:50:23 | 5812 |
| 9 | 1c31fc5c2bcdd6fa113b589e22fb0285 | 2023-12-02 01:50:23 | 7636 |
| 10 | b05c40989b9c6fadfc84330937390052 | 2023-12-02 01:50:23 | 9923 |
+------+----------------------------------+---------------------+------+
10 rows in set (0.00 sec)

使用DuckDB mysqlscanner, 1000万记录简单分析SQL 41秒.

su - postgres
./duckdb

install 'mysql';
load 'mysql';
.timer on

ATTACH 'host=localhost r=digoal password=123456 port=0 database=mysql' AS my (TYPE mysql_scanner, READ_ONLY) ;

D my;
Run Time (s): real 0.002 r 0.000000 sys 0.000860

D show tables;
┌──────────────────────────────────────────────────────┐
│ name │
│ varchar │
├──────────────────────────────────────────────────────┤
│ columns_priv │
│ component │
│ db │
│ default_roles │
│ engine_cost │
│ func │
│ general_log │
│ global_grants │
│ gtid_executed │
│ help_category │
│ help_keyword │
│ help_relation │
│ help_topic │
│ innodb_index_stats │
│ innodb_table_stats │
│ ndb_binlog_index │
│ password_history │
│ plugin │
│ procs_priv │
│ proxies_priv │
│ replication_asynchronous_connection_failover │
│ replication_asynchronous_connection_failover_managed │
│ replication_group_configuration_version │
│ replication_group_member_actions │
│ role_edges │
│ server_cost │
│ servers │
│ slave_master_info │
│ slave_relay_log_info │
│ slave_worker_info │
│ slow_log │
│ tables_priv │
│ tbl │
│ time_zone │
│ time_zone_leap_second │
│ time_zone_name │
│ time_zone_transition │
│ time_zone_transition_type │
│ r │
├──────────────────────────────────────────────────────┤
│ 39 rows │
└──────────────────────────────────────────────────────┘
Run Time (s): real 0.011 r 0.018249 sys 0.004298

D select count(*) from tbl;
100% ▕████████████████████████████████████████████████████████████▏
┌──────────────┐
│ count_star() │
│ int64 │
├──────────────┤
│ 10000000 │
└──────────────┘
Run Time (s): real 6.570 r 0.788166 sys 0.183505

D select count(*), count(distinct id), count(distinct info), count(distinct ts), count(distinct c1) from tbl;
100% ▕████████████████████████████████████████████████████████████▏
┌──────────────┬────────────────────┬──────────────────────┬────────────────────┬────────────────────┐
│ count_star() │ count(DISTINCT id) │ count(DISTINCT info) │ count(DISTINCT ts) │ count(DISTINCT c1) │
│ int64 │ int64 │ int64 │ int64 │ int64 │
├──────────────┼────────────────────┼──────────────────────┼────────────────────┼────────────────────┤
│ 10000000 │ 10000000 │ 10000000 │ 18 │ 10001 │
└──────────────┴────────────────────┴──────────────────────┴────────────────────┴────────────────────┘
Run Time (s): real 40.862 r 17.440941 sys 1.945406

将数据导入DuckDB inmemory db, 1000万记录简单分析SQL 1.4秒.

D show databases;
┌───────────────┐
│ database_name │
│ varchar │
├───────────────┤
│ memory │
│ my │
└───────────────┘
Run Time (s): real 0.003 r 0.001269 sys 0.002332
D memory;
Run Time (s): real 0.002 r 0.000797 sys 0.000000
D create table t1 as select * from my.tbl;
100% ▕████████████████████████████████████████████████████████████▏
Run Time (s): real 35.903 r 12.694567 sys 1.437991
D DESCRIBE t1;
┌─────────────┬──────────────────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │ null │ key │ default │ extra │
│ varchar │ varchar │ varchar │ varchar │ varchar │ int32 │
├─────────────┼──────────────────────────┼─────────┼─────────┼─────────┼───────┤
│ id │ INTEGER │ YES │ │ │ │
│ info │ VARCHAR │ YES │ │ │ │
│ ts │ TIMESTAMP WITH TIME ZONE │ YES │ │ │ │
│ c1 │ INTEGER │ YES │ │ │ │
└─────────────┴──────────────────────────┴─────────┴─────────┴─────────┴───────┘
Run Time (s): real 0.050 r 0.075262 sys 0.024439

D select count(*), count(distinct id), count(distinct info), count(distinct ts), count(distinct c1) from t1;
┌──────────────┬────────────────────┬──────────────────────┬────────────────────┬────────────────────┐
│ count_star() │ count(DISTINCT id) │ count(DISTINCT info) │ count(DISTINCT ts) │ count(DISTINCT c1) │
│ int64 │ int64 │ int64 │ int64 │ int64 │
├──────────────┼────────────────────┼──────────────────────┼────────────────────┼────────────────────┤
│ 10000000 │ 10000000 │ 10000000 │ 18 │ 10001 │
└──────────────┴────────────────────┴──────────────────────┴────────────────────┴────────────────────┘
Run Time (s): real 1.404 r 6.835519 sys 0.699056


欢迎关注我的github (https://github.com/digoal/blog) , 学习数据库不迷路.  

近期正在写公开课材料, 未来将通过视频号推出. 

相关文章

Oracle如何使用授予和撤销权限的语法和示例
Awesome Project: 探索 MatrixOrigin 云原生分布式数据库
下载丨66页PDF,云和恩墨技术通讯(2024年7月刊)
社区版oceanbase安装
Oracle 导出CSV工具-sqluldr2
ETL数据集成丨快速将MySQL数据迁移至Doris数据库

发布评论