共计 2492 个字符,预计需要花费 7 分钟才能阅读完成。
导读:对比两个文件的相似度,在 Python 中可以通过 difflib.SequenceMatcher、ssdeep、python_mmdt、tlsh 实现。当需要对比的文件数量多且文件较大时,对效率的要求更高,可以考虑使用模糊哈希。本文就来和大家详细聊聊。
对比两个文件的相似度,在 Python 中可以通过 difflib.SequenceMatcher、ssdeep、python_mmdt、tlsh 实现。当需要对比的文件数量多且文件较大时,对效率的要求更高,可以考虑使用模糊哈希(fuzzy hash),如 ssdeep、python_mmdt。
测试过程发现:
测试环境:
OS:ubuntu20.04
python:3.8.10
py-tlsh==4.7.2
python-mmdt==0.3.1
ssdeep==3.4
# -*- coding: utf-8 -*- | |
import ssdeep | |
import time | |
from python_mmdt.mmdt.mmdt import MMDT | |
from difflib import SequenceMatcher | |
def difflib_test(file1, file2):
    """Print the difflib similarity ratio of two files and the time taken.

    Both files are read fully into memory as bytes and compared with
    ``difflib.SequenceMatcher``; the ratio and the elapsed seconds are
    written to stdout.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    with open(file1, 'rb') as f:
        s1 = f.read()
    with open(file2, 'rb') as f:
        s2 = f.read()
    match_obj = SequenceMatcher(None, s1, s2)
    print("difflib match:", match_obj.ratio())
    end_time = time.perf_counter()
    print('difflib_test cost:', end_time - start_time)
def mmdt_test(file1, file2):
    """Print the MMDT hashes of two files, their similarity, and the time taken.

    Args:
        file1: Path of the first file; hashed with ``mmdt_hash``.
        file2: Path of the second file; hashed with ``mmdt_hash_streaming``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    mmdt = MMDT()
    r1 = mmdt.mmdt_hash(file1)
    print(r1)
    # NOTE(review): the second file goes through the streaming variant while
    # the first does not -- presumably to exercise both APIs; confirm that
    # both produce hashes that are comparable with each other.
    r2 = mmdt.mmdt_hash_streaming(file2)
    print(r2)
    # Alternative kept from the original: compare the two paths directly
    # instead of comparing precomputed hashes.
    # sim1 = mmdt.mmdt_compare(file1, file2)
    # print("mmdt match:", sim1)
    sim2 = mmdt.mmdt_compare_hash(r1, r2)
    print("mmdt match:", sim2)
    end_time = time.perf_counter()
    print('mmdt_test cost:', end_time - start_time)
def ssdeep_test(file1, file2):
    """Print the ssdeep hashes of two files, their match score, and the time taken.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    sig1 = ssdeep.hash_from_file(file1)
    sig2 = ssdeep.hash_from_file(file2)
    print(sig1)
    print(sig2)
    print("ssdeep match:", ssdeep.compare(sig1, sig2))
    end_time = time.perf_counter()
    print('ssdeep_test cost:', end_time - start_time)
if __name__ == '__main__':
    # Run the three comparisons on the same pair of files and report the
    # total elapsed time. perf_counter() is monotonic, unlike time.time().
    start_time = time.perf_counter()
    # Small-file inputs.
    file1 = '/root/test/fstab'
    file2 = '/root/test/fstab2'
    # Large-file inputs; swap these in to repeat the run on big files.
    # file1 = '/root/test/initrd.img-5.4.0-125-generic'
    # file2 = '/root/test/initrd.img-5.4.0-135-generic'
    mmdt_test(file1, file2)
    ssdeep_test(file1, file2)
    difflib_test(file1, file2)
    end_time = time.perf_counter()
    print('总执行时间:', end_time - start_time)
下面给出对比小文件 / 大文件效果:
测试 tlsh
import tlsh | |
import time | |
def tlsh_test(file1, file2):
    """Print the TLSH difference score of two files and the time taken.

    Both files are read fully into memory as bytes, hashed with
    ``tlsh.hash``, and the two digests are compared with ``tlsh.diff``.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    with open(file1, 'rb') as f:
        s1 = tlsh.hash(f.read())
    with open(file2, 'rb') as f:
        s2 = tlsh.hash(f.read())
    # NOTE(review): per the TLSH documentation, diff() is a distance score
    # (lower means more similar), not a 0-100 match percentage like ssdeep --
    # confirm before comparing this number with the other tools' output.
    match_obj = tlsh.diff(s1, s2)
    print("tlsh match:", match_obj)
    end_time = time.perf_counter()
    # Label fixed: the original printed 'difflib_test cost:' here, a
    # copy/paste leftover that mislabelled the TLSH timing.
    print('tlsh_test cost:', end_time - start_time)
if __name__ == '__main__':
    # Run the TLSH comparison and report the total elapsed time.
    # perf_counter() is monotonic, unlike time.time().
    start_time = time.perf_counter()
    # Small-file inputs; swap these in to repeat the run on small files.
    # file1='/root/test/fstab'
    # file2='/root/test/fstab2'
    # Large-file inputs.
    file1 = '/root/test/initrd.img-5.4.0-125-generic'
    file2 = '/root/test/initrd.img-5.4.0-135-generic'
    tlsh_test(file1, file2)
    end_time = time.perf_counter()
    print('总执行时间:', end_time - start_time)
对比小文件 / 大文件
到此,这篇关于 Python 利用模糊哈希实现对比文件相似度的文章就介绍到这了。
正文完
星哥玩云-微信公众号
