Foreword | People often ask how MySQL partitioned tables should be used, complain that MySQL partitioning falls far short of Oracle's, and wonder whether MySQL partitioned tables are worth using at all.
A Bit of Rambling
Honestly, I cannot give definitive advice on whether you should use them. As always, whatever fits your situation is best: if you are confident you can manage partitioned tables, go ahead and use them.
Maybe I'm Overthinking It
- To use partitioned tables well, be prepared to train your developers so they know how to work with partitions properly.
- Ideally, have an automated plan that handles partition maintenance on a schedule (see the sketch after this list).
- DELETE does not release disk space, while DROP PARTITION does; dropping partitions saves space and also avoids the performance problems caused by large numbers of delete-marked rows.
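As a concrete illustration of that automation, here is a minimal sketch using the MySQL event scheduler. This is my own addition, not part of the original workflow: the event name ev_rotate_ord_order is made up, it assumes the event scheduler is enabled (SET GLOBAL event_scheduler = ON), and it assumes a twelve-month retention. Each month it adds next month's partition and drops the partition from a year ago (which you would first archive to HBase, as shown later in this article).

DELIMITER $$
CREATE EVENT ev_rotate_ord_order
ON SCHEDULE EVERY 1 MONTH STARTS '2017-01-01 03:00:00'
DO
BEGIN
    -- Partition names follow the pYYYYMM convention used below.
    DECLARE next_part CHAR(6) DEFAULT DATE_FORMAT(DATE_ADD(NOW(), INTERVAL 1 MONTH), '%Y%m');
    DECLARE old_part  CHAR(6) DEFAULT DATE_FORMAT(DATE_SUB(NOW(), INTERVAL 12 MONTH), '%Y%m');
    SET @add_sql  = CONCAT('ALTER TABLE ord_order ADD PARTITION (PARTITION p', next_part,
                           ' VALUES IN (', next_part, '))');
    SET @drop_sql = CONCAT('ALTER TABLE ord_order DROP PARTITION p', old_part);
    -- Partition names vary by month, so the DDL is built as dynamic SQL.
    PREPARE stmt FROM @add_sql;  EXECUTE stmt; DEALLOCATE PREPARE stmt;
    -- Make sure the old partition has been exported (e.g. with Sqoop) before this DROP runs.
    PREPARE stmt FROM @drop_sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
END$$
DELIMITER ;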
Getting Started
Creating the MySQL Partitioned Table
DROP TABLE ord_order;
-- Create the partitioned orders table
CREATE TABLE ord_order(
order_id BIGINT NOT NULL AUTO_INCREMENT COMMENT 'Order ID',
user_id INT NOT NULL COMMENT 'User ID',
goods_id INT NOT NULL COMMENT 'Goods ID',
order_price INT NOT NULL DEFAULT 0 COMMENT 'Actual order price (in cents)',
create_time DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'Creation time',
PRIMARY KEY(order_id, create_time)
)
PARTITION BY LIST (YEAR(create_time)*100 + MONTH(create_time))
(PARTITION p201601 VALUES IN (201601),
PARTITION p201602 VALUES IN (201602),
PARTITION p201603 VALUES IN (201603),
PARTITION p201604 VALUES IN (201604),
PARTITION p201605 VALUES IN (201605),
PARTITION p201606 VALUES IN (201606),
PARTITION p201607 VALUES IN (201607),
PARTITION p201608 VALUES IN (201608),
PARTITION p201609 VALUES IN (201609),
PARTITION p201610 VALUES IN (201610),
PARTITION p201611 VALUES IN (201611),
PARTITION p201612 VALUES IN (201612)
);
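A quick sanity check of the partition expression (a throwaway query of my own, not part of the setup): YEAR(create_time)*100 + MONTH(create_time) maps each row's timestamp to a yyyymm integer that matches exactly one of the VALUES IN lists above.

SELECT YEAR('2016-01-13 01:00:10')*100 + MONTH('2016-01-13 01:00:10') AS part_key;
-- Returns 201601, so such a row lands in partition p201601.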
-- Insert sample data
INSERT INTO ord_order VALUES
(NULL, 10000001, 11111111, 1000, '2016-01-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-01-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-01-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-01-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-01-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-02-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-02-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-02-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-02-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-02-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-03-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-03-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-03-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-03-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-03-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-04-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-04-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-04-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-04-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-04-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-05-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-05-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-05-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-05-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-05-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-06-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-06-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-06-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-06-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-06-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-07-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-07-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-07-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-07-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-07-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-08-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-08-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-08-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-08-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-08-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-09-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-09-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-09-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-09-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-09-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-10-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-10-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-10-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-10-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-10-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-11-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-11-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-11-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-11-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-11-13 05:00:50'),
(NULL, 10000001, 11111111, 1000, '2016-12-13 01:00:10'),
(NULL, 10000001, 11111112, 2000, '2016-12-13 02:00:20'),
(NULL, 10000001, 11111113, 3000, '2016-12-13 03:00:30'),
(NULL, 10000001, 11111114, 4000, '2016-12-13 04:00:40'),
(NULL, 10000001, 11111115, 5000, '2016-12-13 05:00:50');
-- View the data in partition p201601
SELECT * FROM ord_order PARTITION(p201601);
-- The composite row key to be used in HBase
SELECT CONCAT(user_id, 10000000000-UNIX_TIMESTAMP(create_time), goods_id)
FROM ord_order PARTITION(p201601);
Bringing HBase into the Picture
Create the HBase Table ord_order
Because of version compatibility issues, I need to create the corresponding HBase table in advance; otherwise the import fails with an error saying the column family cannot be created automatically.
Create the ord_order table in the hbase shell:
hbase(main):033:0> create 'ord_order', {NAME => 'cf1'}
Use Sqoop to import the data from the p201601 partition of the MySQL ord_order table into the HBase table.
/usr/local/sqoop/bin/sqoop import \
--connect jdbc:mysql://192.168.137.11:3306/test \
--username HH \
--password oracle \
--query 'SELECT CONCAT(user_id, 10000000000-UNIX_TIMESTAMP(create_time), goods_id) AS order_id, order_price, create_time FROM ord_order PARTITION(p201601) WHERE $CONDITIONS' \
--hbase-table ord_order \
--hbase-create-table \
--hbase-row-key order_id \
--split-by order_id \
--column-family cf1 \
-m 1
Once the import has succeeded, the corresponding partition can be dropped on the MySQL side, and the partitions needed going forward can be created:
ALTER TABLE ord_order
ADD PARTITION (PARTITION p201701 VALUES IN (201701));
ALTER TABLE ord_order DROP PARTITION p201601;
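To verify the resulting partition layout (my own addition; information_schema.PARTITIONS is a standard view for partitioned tables), you can run:

SELECT PARTITION_NAME, PARTITION_DESCRIPTION, TABLE_ROWS
FROM information_schema.PARTITIONS
WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 'ord_order';
-- After the rotation, p201601 should be gone and p201701 should appear.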
Inspect the Imported Data in HBase
hbase(main):001:0> scan 'ord_order'
ROW COLUMN+CELL
10000001854736755011111115 column=cf1:create_time, timestamp=1479224942888, value=2016-01-13 05:00:50.0
10000001854736755011111115 column=cf1:order_price, timestamp=1479224942888, value=5000
10000001854737116011111114 column=cf1:create_time, timestamp=1479224942888, value=2016-01-13 04:00:40.0
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
10000001854737477011111113 column=cf1:create_time, timestamp=1479224942888, value=2016-01-13 03:00:30.0
10000001854737477011111113 column=cf1:order_price, timestamp=1479224942888, value=3000
10000001854737838011111112 column=cf1:create_time, timestamp=1479224942888, value=2016-01-13 02:00:20.0
10000001854737838011111112 column=cf1:order_price, timestamp=1479224942888, value=2000
10000001854738199011111111 column=cf1:create_time, timestamp=1479224942888, value=2016-01-13 01:00:10.0
10000001854738199011111111 column=cf1:order_price, timestamp=1479224942888, value=1000
5 row(s) in 0.5390 seconds
Row Key Design Explained
The HBase row key is composed of three fields: user_id, 10000000000-UNIX_TIMESTAMP(create_time), and goods_id.
Note in particular the 10000000000-UNIX_TIMESTAMP(create_time) component: it is designed so that orders sort in reverse chronological order, matching the usual requirement that the newest data is shown first.
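To make the ordering concrete, here is the arithmetic for the oldest January row (my own worked example; note that UNIX_TIMESTAMP is session-timezone dependent, and the values below assume the author's UTC+8 session):

SELECT UNIX_TIMESTAMP('2016-01-13 01:00:10') AS ts,                   -- 1452618010
       10000000000 - UNIX_TIMESTAMP('2016-01-13 01:00:10') AS rev_ts, -- 8547381990
       CONCAT(10000001,
              10000000000 - UNIX_TIMESTAMP('2016-01-13 01:00:10'),
              11111111) AS row_key;                                   -- 10000001854738199011111111
-- This matches the last row of the scan above: the oldest order gets the
-- largest reversed timestamp and sorts last, while the newest order sorts first.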
For example, suppose we need to paginate the orders of user 10000001, two rows per page, sorted in reverse chronological order (newest orders first):
hbase(main):003:0> scan 'ord_order', {COLUMNS=>['cf1:order_price'], ROWPREFIXFILTER=>'10000001', LIMIT=>2}
ROW COLUMN+CELL
10000001854736755011111115 column=cf1:order_price, timestamp=1479224942888, value=5000
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
Fetching the next page:
hbase(main):004:0> scan 'ord_order', {COLUMNS=>['cf1:order_price'], LIMIT=>3, STARTROW=>'10000001854737116011111114'}
ROW COLUMN+CELL
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
10000001854737477011111113 column=cf1:order_price, timestamp=1479224942888, value=3000
10000001854737838011111112 column=cf1:order_price, timestamp=1479224942888, value=2000
3 row(s) in 0.0260 seconds
The scan above returns three rows; when rendering the page, simply discard the first row. What actually gets displayed:
10000001854737477011111113 column=cf1:order_price, timestamp=1479224942888, value=3000
10000001854737838011111112 column=cf1:order_price, timestamp=1479224942888, value=2000
Fetching the previous page:
hbase(main):008:0> scan 'ord_order', {COLUMNS=>['cf1:order_price'], LIMIT=>3, STARTROW=>'10000001854737477011111113', REVERSED=>true}
ROW COLUMN+CELL
10000001854737477011111113 column=cf1:order_price, timestamp=1479224942888, value=3000
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
10000001854736755011111115 column=cf1:order_price, timestamp=1479224942888, value=5000
3 row(s) in 0.0640 seconds
Again three rows come back; we drop the first row and then display the remaining result set in reverse order:
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
10000001854736755011111115 column=cf1:order_price, timestamp=1479224942888, value=5000
↓↓↓↓↓ The two rows above are the raw result set; the two rows below are that set traversed in reverse (what is finally displayed)
10000001854736755011111115 column=cf1:order_price, timestamp=1479224942888, value=5000
10000001854737116011111114 column=cf1:order_price, timestamp=1479224942888, value=4000
Summary
Designing and using HBase well is a craft in itself; in practice, row key design is usually worked out together with the development and product teams. Reading up on other people's experience also goes a long way, because the problems you run into are likely problems someone else has already run into.