Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
78 views

External Merge Sort Python

This document discusses an external sorting algorithm that uses an invariant heap data structure. It defines an InvariantHeapNode class to represent nodes in the heap with an element and file. An ExternalSort class splits a large file into smaller sorted runs stored in temporary files. An InvariantHeap class implements a min heap to efficiently merge the sorted runs by repeatedly extracting and replacing the minimum node. The merge_files function uses this class to merge all runs into a single sorted list.

Uploaded by

Max Zhukousky
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
78 views

External Merge Sort Python

This document discusses an external sorting algorithm that uses an invariant heap data structure. It defines an InvariantHeapNode class to represent nodes in the heap with an element and file. An ExternalSort class splits a large file into smaller sorted runs stored in temporary files. An InvariantHeap class implements a min heap to efficiently merge the sorted runs by repeatedly extracting and replacing the minimum node. The merge_files function uses this class to merge all runs into a single sorted list.

Uploaded by

Max Zhukousky
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

# based on the idea of invariant heap

import tempfile
import sys
from typing import List

# class InvariantHeapNode:
# def __init__(self, element, i, j):
# self.element = element
# self.i = i
# self.j = j

class InvariantHeapNode:
    """Heap entry pairing a value with the run file it was read from.

    Both attributes are plain and mutable, so a node can be updated in place
    with the next value of its run and pushed back into the heap.
    """

    def __init__(self, element, file):
        # element: the value the heap orders nodes by.
        # file: the handle that produced ``element``; kept so the next value
        # of the same run can be fetched once this one has been consumed.
        self.element = element
        self.file = file

    def __repr__(self):
        return (
            f'{type(self).__name__}'
            f'(element={self.element!r}, file={self.file!r})'
        )

class ExternalSort():
    """Split a text file of integers (one per line) into sorted runs on disk.

    Every run is an open, rewound ``tempfile.NamedTemporaryFile`` stored in
    ``sorted_temp_files``. The files are deleted when they are closed, so
    they must stay open until the merge step has consumed them; call
    ``close()`` afterwards to release them.
    """

    def __init__(self):
        # Open run files, in the order the chunks were read from the input.
        self.sorted_temp_files = []

    def close(self):
        """Close (and thereby delete) every run file and forget them."""
        for run in self.sorted_temp_files:
            run.close()
        self.sorted_temp_files = []

    def _write_run(self, lines):
        """Sort ``lines`` numerically and store them as a new rewound run."""
        lines.sort(key=int)
        # 'w+' so the same handle can be read back during the merge; the
        # file is deliberately NOT wrapped in ``with`` because leaving the
        # context would close and delete it before it is merged.
        run = tempfile.NamedTemporaryFile('w+')
        try:
            # Re-add the newline explicitly: the last input line may lack
            # one, which would fuse two numbers together after sorting.
            run.writelines(f'{line}\n' for line in lines)
            run.seek(0)
        except Exception:
            run.close()
            raise
        self.sorted_temp_files.append(run)

    def make_runs(self, filename, files_size):
        """Cut ``filename`` into sorted runs of at most ``files_size`` numbers.

        Blank lines are ignored. Runs from a previous call are closed and
        replaced.

        Args:
            filename: path of a text file holding one integer per line.
            files_size: maximum number of values per run; must be >= 1.

        Returns:
            The list of open run files (the same object as
            ``self.sorted_temp_files``).

        Raises:
            ValueError: if ``files_size`` is smaller than 1 or a non-blank
                line is not an integer.
        """
        if files_size < 1:
            raise ValueError('files_size must be a positive integer')

        self.close()
        chunk = []
        with open(filename) as f:
            for raw_line in f:
                number = raw_line.strip()
                if not number:
                    continue
                chunk.append(number)
                if len(chunk) == files_size:
                    self._write_run(chunk)
                    chunk = []
        # The input length is rarely a multiple of files_size: flush the
        # remaining values as a final, shorter run.
        if chunk:
            self._write_run(chunk)
        return self.sorted_temp_files

class InvariantHeap():
    """Array-backed binary min-heap ordered by each node's ``element``.

    The heap has a fixed size: the minimum is never popped, only replaced
    (``replace_min``), which is all a k-way merge needs. The list passed to
    the constructor is used as storage and reordered in place.
    """

    def __init__(self, my_heap: List["InvariantHeapNode"]):
        super().__init__()
        self.my_heap = my_heap
        # Build the heap bottom-up: sift down every internal node, starting
        # with the last parent. Leaves already satisfy the heap property.
        for parent in range(len(my_heap) // 2 - 1, -1, -1):
            self.heapify(parent)

    def heapify(self, i):
        """Sift the node at index ``i`` down until neither child is smaller."""
        heap = self.my_heap
        size = len(heap)
        while True:
            left = 2 * i + 1
            right = left + 1
            smallest = i
            if left < size and heap[left].element < heap[smallest].element:
                smallest = left
            if right < size and heap[right].element < heap[smallest].element:
                smallest = right
            if smallest == i:
                return
            heap[i], heap[smallest] = heap[smallest], heap[i]
            i = smallest

    def get_min(self):
        """Return the node with the smallest element without removing it.

        Raises:
            IndexError: if the heap is empty.
        """
        if not self.my_heap:
            raise IndexError('Underflow: heap is empty')
        return self.my_heap[0]  # the root is the minimum of a min-heap

    def replace_min(self, root):
        """Overwrite the root with ``root`` and restore the heap property.

        Raises:
            IndexError: if the heap is empty.
        """
        self.my_heap[0] = root
        self.heapify(0)

def _read_next_number(run):
    """Return the next integer in ``run``, or None once the run is exhausted."""
    while True:
        line = run.readline()
        if not line:  # '' means end of file; a blank line is '\n'
            return None
        text = line.strip()
        if text:
            return int(text)


def merge_files(sorted_temp_files=None):
    """K-way merge of sorted run files into one ascending list of ints.

    Args:
        sorted_temp_files: iterable of readable text files, each positioned
            at its start and holding one integer per line in ascending
            order (for example ``ExternalSort.sorted_temp_files``). ``None``
            or an empty iterable yields an empty result.

    Returns:
        A list with every integer from every run, in ascending order.

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    # An exhausted run keeps its heap slot but sinks to the bottom. No int
    # compares equal to infinity, so real data can never be mistaken for
    # the sentinel (unlike sys.maxsize, which is a legal value).
    exhausted = float('inf')

    array = []
    for temp_file in sorted_temp_files or ():
        first = _read_next_number(temp_file)
        if first is not None:
            array.append(InvariantHeapNode(first, temp_file))

    result = []
    if not array:
        return result

    inv_heap = InvariantHeap(array)
    while True:
        root = inv_heap.get_min()
        if root.element == exhausted:
            # The minimum is the sentinel, so every run has been drained.
            break
        result.append(root.element)
        # Refill the root from the run that just supplied the minimum.
        following = _read_next_number(root.file)
        root.element = exhausted if following is None else following
        inv_heap.replace_min(root)
    return result


# Legacy in-memory variant (node carried run index i and position j instead
# of a file handle); kept for reference only.
#
# def merge_k_runs(runs: Matrix, k: int):
#     array = []
#     result_size = 0
#     for i in range(len(runs)):
#         node = InvariantHeapNode(runs[i][0], i, 1)
#         array.append(node)
#         result_size += len(runs[i])
#
#     inv_heap = InvariantHeap(array, k)
#     result = [0]*result_size
#     for i in range(result_size):
#         root = inv_heap.get_min()
#         result[i] = root.element
#         if root.j < len(runs[root.i]):
#             root.element = runs[root.i][root.j]
#             root.j += 1
#         else:
#             root.element = sys.maxsize
#         inv_heap.replace_min(root)
#     return result


def main(filename='/home/max/ext_sort.txt', files_size=5):
    """Sort the integers in ``filename`` with an external merge sort.

    Args:
        filename: path of a text file holding one integer per line.
        files_size: maximum number of values kept in memory per sorted run.

    Returns:
        The merged list produced by ``merge_files``.
    """
    # Scratch input for the legacy in-memory variant, kept for reference:
    # runs_num = 10
    # runs_size = 1000
    #
    # with open("/home/max/input.txt") as f:
    #     num_array = [int(x) for x in f.writelines().split()]
    #
    # runs = [[3, 2, 1], [9, 8, 6], [5, 2, 1]]
    #
    # runs = [
    #     [2, 6, 12, 34],
    #     [1, 9, 20, 1000],
    #     [23, 34, 90, 2000]
    # ]
    # a = merge_k_runs(runs, len(runs))
    # print(a)

    obj = ExternalSort()
    try:
        obj.make_runs(filename, files_size)
        # Merge the runs of THIS sorter; a fresh ExternalSort has none.
        return merge_files(obj.sorted_temp_files)
    finally:
        # Temporary run files are deleted when closed.
        for run in obj.sorted_temp_files:
            run.close()

# Run the sort only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

You might also like