浅谈PostgreSQL的索引

共计 8179 个字符，预计需要花费 21 分钟才能阅读完成。

当表数据量越来越大时查询速度会下降，在表的条件字段上使用索引，快速定位到可能满足条件的记录，不需要遍历所有记录。

create table t(id int, info text);
insert into t select generate_series(1,10000),'lottu'||generate_series(1,10000);
create table t1 as select * from t;
create table t2 as select * from t;
create index ind_t2_id on t2(id);

lottu=# analyze t1;
ANALYZE
lottu=# analyze t2;
ANALYZE
# 没有索引
lottu=# explain (analyze,buffers,verbose) select * from t1 where id < 10;
                                             QUERY PLAN                                              
-----------------------------------------------------------------------------------------------------
 Seq Scan on lottu.t1  (cost=0.00..180.00 rows=9 width=13) (actual time=0.073..5.650 rows=9 loops=1)
   Output: id, info
   Filter: (t1.id < 10)
   Rows Removed by Filter: 9991
   Buffers: shared hit=55
 Planning time: 25.904 ms
 Execution time: 5.741 ms
(7 rows)
# 有索引
lottu=# explain (analyze,verbose,buffers) select * from t2 where id < 10;
                                                     QUERY PLAN                                                      
---------------------------------------------------------------------------------------------------------------------
 Index Scan using ind_t2_id on lottu.t2  (cost=0.29..8.44 rows=9 width=13) (actual time=0.008..0.014 rows=9 loops=1)
   Output: id, info
   Index Cond: (t2.id < 10)
   Buffers: shared hit=3
 Planning time: 0.400 ms
 Execution time: 0.052 ms
(6 rows)

# 在这个案例中：执行同一条 SQL。t2 有索引的执行数据是 0.052 ms；t1 没有索引的是：5.741 ms;

索引本身就是有序的。

# 没有索引
lottu=# explain (analyze,verbose,buffers) select * from t1 where id > 2 order by id;
                                                   QUERY PLAN                                                    
-----------------------------------------------------------------------------------------------------------------
Sort  (cost=844.31..869.31 rows=9999 width=13) (actual time=8.737..11.995 rows=9998 loops=1)
   Output: id, info
   Sort Key: t1.id
   Sort Method: quicksort  Memory: 853kB
   Buffers: shared hit=55
   ->  Seq Scan on lottu.t1  (cost=0.00..180.00 rows=9999 width=13) (actual time=0.038..5.133 rows=9998 loops=1)
         Output: id, info
         Filter: (t1.id > 2)
         Rows Removed by Filter: 2
         Buffers: shared hit=55
 Planning time: 0.116 ms
 Execution time: 15.205 ms
(12 rows)
 #有索引
lottu=# explain (analyze,verbose,buffers) select * from t2 where id > 2 order by id;
                                                         QUERY PLAN                                                          
-----------------------------------------------------------------------------------------------------------------------------
 Index Scan using ind_t2_id on lottu.t2  (cost=0.29..353.27 rows=9999 width=13) (actual time=0.030..5.304 rows=9998 loops=1)
   Output: id, info
   Index Cond: (t2.id > 2)
   Buffers: shared hit=84
 Planning time: 0.295 ms
 Execution time: 7.027 ms
(6 rows)

# 在这个案例中：执行同一条 SQL。

t2 有索引的执行数据是 7.027 ms；t1 没有索引的是：15.205 ms;
t1 没有索引执行还占用了 Memory: 853kB。

索引的扫描方式有 3 种

先查索引找到匹配记录的 ctid，再通过 ctid 查堆表

先查索引找到匹配记录的 ctid 集合，把 ctid 通过 bitmap 做集合运算和排序后再查堆表

如果索引字段中包含了所有返回字段，对可见性映射 (vm)中全为可见的数据块，不查堆表直接返回索引中的值。

这里谈谈 Indexscan 扫描方式和 Indexonlyscan 扫描方式
对这两种扫描方式区别；借用 Oracle 中索引扫描方式来讲；Indexscan 扫描方式会产生回表读。根据上面解释来说；Indexscan 扫描方式：查完索引之后还需要查表。Indexonlyscan 扫描方式只需要查索引。也就是说：Indexonlyscan 扫描方式要优于 Indexscan 扫描方式？我们来看看

现有表 t；在字段 id 上面建来 ind_t_id 索引
1. t 表没有 VM 文件。lottu=# \d+ t
                           Table "lottu.t"
 Column |  Type   | Modifiers | Storage  | Stats target | Description 
--------+---------+-----------+----------+--------------+-------------
 id     | integer |           | plain    |              | 
 info   | text    |           | extended |              | 
Indexes:
    "ind_t_id" btree (id)

lottu=# explain (analyze,buffers,verbose) select id from t where id < 10;
                                                      QUERY PLAN                                                       
-----------------------------------------------------------------------------------------------------------------------
 Index Only Scan using ind_t_id on lottu.t  (cost=0.29..8.44 rows=9 width=4) (actual time=0.009..0.015 rows=9 loops=1)
   Output: id
   Index Cond: (t.id < 10)
   Heap Fetches: 9
   Buffers: shared hit=3
 Planning time: 0.177 ms
 Execution time: 0.050 ms
(7 rows)
#人为更改执行计划
lottu=# set enable_indexonlyscan = off;
SET
lottu=# explain (analyze,buffers,verbose) select id from t where id < 10;
                                                    QUERY PLAN                                                    
------------------------------------------------------------------------------------------------------------------
 Index Scan using ind_t_id on lottu.t  (cost=0.29..8.44 rows=9 width=4) (actual time=0.008..0.014 rows=9 loops=1)
   Output: id
   Index Cond: (t.id < 10)
   Buffers: shared hit=3
 Planning time: 0.188 ms
 Execution time: 0.050 ms
(6 rows)
# 可以发现两者几乎没有差异；唯一不同的是 Indexonlyscan 扫描方式存在扫描的 Heap Fetches 时间。这个时间是不在 Execution time 里面的。2. t 表有 VM 文件
lottu=# delete from t where id >200 and id < 500;
DELETE 299
lottu=# vacuum t;
VACUUM
lottu=# analyze t;
ANALYZE
lottu=# explain (analyze,buffers,verbose) select id from t where id < 10;
                                                      QUERY PLAN                                                       
-----------------------------------------------------------------------------------------------------------------------
 Index Only Scan using ind_t_id on lottu.t  (cost=0.29..4.44 rows=9 width=4) (actual time=0.008..0.012 rows=9 loops=1)
   Output: id
   Index Cond: (t.id < 10)
   Heap Fetches: 0
   Buffers: shared hit=3
 Planning time: 0.174 ms
 Execution time: 0.048 ms
(7 rows)

lottu=# set enable_indexonlyscan = off;
SET
lottu=# explain (analyze,buffers,verbose) select id from t where id < 10;
                                                    QUERY PLAN                                                    
------------------------------------------------------------------------------------------------------------------
 Index Scan using ind_t_id on lottu.t  (cost=0.29..8.44 rows=9 width=4) (actual time=0.012..0.022 rows=9 loops=1)
   Output: id
   Index Cond: (t.id < 10)
   Buffers: shared hit=3
 Planning time: 0.179 ms
 Execution time: 0.077 ms
(6 rows)

总结：

Index Only Scan 在没有 VM 文件的情况下, 速度比 Index Scan 要慢, 因为要扫描所有的 Heap page。差异几乎不大。
Index Only Scan 存在 VM 文件的情况下，是��比 Index Scan 要快。

知识点 1：

VM 文件：称为可见性映射文件；该文件存在表示：该数据块没有需要清理的行。即已经做了 vaccum 操作。

知识点 2：

人为选择执行计划。可设置 enable_xxx 参数有

enable_bitmapscan
enable_hashagg
enable_hashjoin
enable_indexonlyscan
enable_indexscan
enable_material
enable_mergejoin
enable_nestloop
enable_seqscan
enable_sort
enable_tidscan

参考文献

参考德哥：《PostgreSQL 性能优化培训 3 DAY.pdf》
https://www.postgresql.org/docs/9.6/static/runtime-config-query.html

PostgreSQL 支持索引类型有: B-tree, Hash, GiST, SP-GiST, GIN and BRIN。

postgresql—-Btree 索引:http://www.cnblogs.com/alianbog/p/5621749.html
postgresql—-hash 索引：一般只用于简单等值查询。不常用。
postgresql—-Gist 索引:http://www.cnblogs.com/alianbog/p/5628543.html

创建索引语法：

lottu=# \h create index
Command:     CREATE INDEX
Description: define a new index
Syntax:
CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [IF NOT EXISTS ] name ] ON table_name [ USING method ]
    ({ column_name | (expression) } [ COLLATE collation ] [ opclass ] [ ASC | DESC ] [ NULLS {FIRST | LAST} ] [, ...] )
    [ WITH (storage_parameter = value [, ... ] ) ]
    [ TABLESPACE tablespace_name ]
    [ WHERE predicate ]
接下来我们以 t 表为例。1. 关键字【UNIQUE】#创建唯一索引；主键就是一种唯一索引
CREATE UNIQUE INDEX ind_t_id_1 on t (id);
2. 关键字【CONCURRENTLY】# 这是并发创建索引。跟 oracle 的 online 创建索引作用是一样的。创建索引过程中；不会阻塞表更新，插入，删除操作。当然创建的时间就会很漫长。CREATE INDEX CONCURRENTLY ind_t_id_2 on t (id);
3. 关键字【IF NOT EXISTS】#用该命令是用于确认索引名是否存在。若存在；也不会报错。CREATE INDEX IF NOT EXISTS ind_t_id_3 on t (id);
4. 关键字【USING】# 创建哪种类型的索引。默认是 B -tree。CREATE INDEX ind_t_id_4 on t using btree (id);
5 关键字【[ ASC | DESC ] [ NULLS {FIRST | LAST]】# 创建索引是采用降序还是升序。若字段存在 null 值，是把 null 值放在前面还是最后：例如采用降序，null 放在前面。CREATE INDEX ind_t_id_5 on t (id desc nulls first)
6. 关键字【WITH (storage_parameter = value)】#索引的填充因子设为。例如创建索引的填充因子设为 75
CREATE INDEX ind_t_id_6 on t (id) with (fillfactor = 75);
7. 关键字【TABLESPACE】#是把索引创建在哪个表空间。CREATE INDEX ind_t_id_7 on t (id) TABLESPACE tsp_lottu;
8. 关键字【WHERE】#只在自己感兴趣的那部分数据上创建索引，而不是对每一行数据都创建索引，此种方式创建索引就需要使用 WHERE 条件了。CREATE INDEX ind_t_id_8 on t (id) WHERE id < 1000;

修改索引语法

lottu=# \h alter index
Command:     ALTER INDEX
Description: change the definition of an index
Syntax:
#把索引重新命名
ALTER INDEX [ IF EXISTS ] name RENAME TO new_name
#把索引迁移表空间
ALTER INDEX [ IF EXISTS ] name SET TABLESPACE tablespace_name
#把索引重设置填充因子
ALTER INDEX [ IF EXISTS ] name SET (storage_parameter = value [, ... ] )
#把索引的填充因子设置为默认值
ALTER INDEX [ IF EXISTS ] name RESET (storage_parameter [, ... ] )
#把表空间 TSP1 中索引迁移到新表空间
ALTER INDEX ALL IN TABLESPACE name [ OWNED BY role_name [, ... ] ]
    SET TABLESPACE new_tablespace [ NOWAIT ]

删除索引语法

lottu=# \h drop index
Command:     DROP INDEX
Description: remove an index
Syntax:
DROP INDEX [ CONCURRENTLY ] [ IF EXISTS ] name [, ...] [ CASCADE | RESTRICT ]

索引能带来加快对表中记录的查询，排序，以及唯一约束的作用。索引也是有代价

索引需要增加数据库的存储空间。
在表记录执行插入，更新，删除操作。索引也要更新。

select pg_size_pretty(pg_relation_size('ind_t_id'));

--通过 pg_stat_user_indexes.idx_scan 可检查利用索引进行扫描的次数；这样可以确认那些索引可以清理掉。
select idx_scan from pg_stat_user_indexes where indexrelname = 'ind_t_id';

--如果一个表经过频繁更新之后，索引性能不好；需要重建索引。
lottu=# select pg_size_pretty(pg_relation_size('ind_t_id_1')); 
 pg_size_pretty 
----------------
 2200 kB
(1 row)

lottu=# delete from t where id > 1000;
DELETE 99000

lottu=# analyze t;
ANALYZE
lottu=# select pg_size_pretty(pg_relation_size('ind_t_id_1')); 
 pg_size_pretty 
----------------
 2200 kB
 
lottu=# insert into t select generate_series(2000,100000),'lottu';
INSERT 0 98001

lottu=# select pg_size_pretty(pg_relation_size('ind_t_id_1')); 
 pg_size_pretty 
----------------
 4336 kB
(1 row)

lottu=# vacuum full t;
VACUUM

lottu=# select pg_size_pretty(pg_relation_size('ind_t_id_1')); 
 pg_size_pretty 
----------------
 2176 kB
 
重建方法：1. reindex：reindex 不支持并行重建【CONCURRENTLY】; 索引会锁表；会进行阻塞。2. vacuum full; 对表进行重构；索引也会重建；同样也会锁表。3. 创建一个新索引(索引名不同)；再删除旧索引。