Efficient Finding of Duplicated Files

Uwaga! Informacje na tej stronie mają ponad 6 lat. Nadal je udostępniam, ale prawdopodobnie nie odzwierciedlają one mojej aktualnej wiedzy ani przekonań.

Tue
14
Jul 2009

I've recently wrote a tool for finding duplicated files in a given directory. I wanted to do it efficiently and here is my algoritm.

My basic idea is to first recursively enumerate all files in the directory and its subdirectories and sort their descriptions by file size. Making it this way, we can then iterate through this list and for each file look ahead to see how many other files have same size. If two files have different sizes, they are obviously not equal in terms of their content. So if there is only one file with particular size, it can just be skipped. If there are two files with same size, we must just compare them by content.

If there are many files with same size, things become more complicated, as any possible combinations of them can turn out to be identical. My solution is to compare two files by a "header" (lets say - first 1024 bytes) and if they are equal - by MD5 checksum of their whole content. Checksum of each file is lazy evaluated, so it's calculated only once, the first time its needed.

As I iterate through the sorted file list, I mark reported files not to process them again. I can do it because I ensure that if a file is reported, all other identical files are also already reported. In my real code I do the same with files I encounter errors with, but here I wanted to simplify the code.

Here is my data structure. I'm going to recursively enumerate all files and store information about them in a sequence sorted by file size.

struct FILE_DESC
{
  CString Path;
  LARGE_INTEGER Size;
  FILETIME LastWriteTime;
  mutable SMD5Sum MD5;
  mutable bool MD5Calculated, Reported;

  bool operator < (const FILE_DESC &fd) const
  {
    return Size.QuadPart < fd.Size.QuadPart;
  }
};
typedef std::multiset<FILE_DESC> FILE_DESC_SET;

My main procedure is splitted into two parts:

CString Dir = ...;
FILE_DESC_SET set;

ListDir(set, Dir);
CompareFiles(set);

The ListDir procedure recursively enumerates all files and subdirectories in a given directory, adding information about every file to the set.

void ListDir(FILE_DESC_SET &set, const CString &dir)
{
  CString findPath, path;
  // findPath = dir + "\\*"
  WIN32_FIND_DATA findData;
  HANDLE findHandle = FindFirstFile(findPath, &findData);
  if (findHandle != INVALID_HANDLE_VALUE)
  {
    FILE_DESC fileDesc;
    fileDesc.MD5Calculated = false;
    fileDesc.Reported = false;
    do
    {
      if (strcmp(findData.cFileName, ".") != 0
        && strcmp(findData.cFileName, "..") != 0
        && (findData.dwFileAttributes & FILE_ATTRIBUTE_HIDDEN) == 0 )
      {
        // path = dir + "\\" + findData.cFileName
        if ((findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0)
          ListDir(set, path); // Recursion
        else if (findData.nFileSizeHigh > 0 || findData.nFileSizeLow > 0)
        {
          fileDesc.Path = path;
          fileDesc.Size.HighPart = (LONG)findData.nFileSizeHigh;
          fileDesc.Size.LowPart = findData.nFileSizeLow;
          fileDesc.LastWriteTime = findData.ftLastWriteTime;
          set.insert(fileDesc);
        }
      }
    }
    while (FindNextFile(findHandle, &findData) != 0);
    FindClose(findHandle);
  }
}

Now, when we have the list of all files sorted in ascending size, we can execute the main algorithm. It's quite complicated, so I will show its pseudocode first.

foreach it1 in set:
{
  if it1 is already Reported:
    Skip this file.
  sameSizeFileCount = number of subsequent files in set,
    which have same file size and are not already Reported.
  if sameSizeFileCount = 0:
    // There are no other files with that size - it's surely unique.
    Skip this file.
  if sameSizeFileCount = 1:
  {
    // There are two files with same size - we must compare them by content.
    it2 = next file in set after it1, which is not already Reported.
    Compare files it1 and it2 by content.
    if they are equal:
      Report it1 and it2 as duplicated files.
  }
  else
  {
    // There are more than two files with same content...
    Open file it1.
    fileSize1 = size of file it1.
    Buf1 = read up to 1024 first bytes from file it1.
    foreach it2 in set after it1,
      which have same size and are not already Reported:
    {
      Open file it2.
      Buf2 = read up to 1024 first bytes from file it2.
      if data in Buf1 and Buf2 equal:
      {
        if fileSize1 <= 1024:
          // Files are not bigger than these first 1024 bytes,
          // so if Buf1 is equal to Buf2, they are whole identical.
          filesIdentical = true.
        else
        {
          // Files are bigger than these 1024 bytes,
          // so we have to compare them further by MD5 checksum.
          if MD5 checksum for file it1 is not already calculated:
            // First 1024 bytes of file it1 are in Buf1.
            // Rest of bytes can be read from already opened file it1.
            Calculate and save MD5 checksum for file it1.
          if MD5 checksum for file it2 is not already calculated:
            // First 1024 bytes of file it2 are in Buf2.
            // Rest of bytes can be read from already opened file it2.
            Calculate and save MD5 checksum for file it2.
          filesIdentical = MD5 of it1 = MD5 of it2.
        }
        
        if filesIdentical:
        {
          if it1 is not already Reported:
          {
            Report it1 file as first in dupliacted files group.
            Mark it1 file as Reported.
          }
          
          Report it2 file as next in dupliacted files group.
          Mark it2 file as Reported.
        }
      }
    }
  }
}

And here is the (almost) real code:

void CompareFiles(FILE_DESC_SET &set)
{
  FILE_DESC_SET::iterator it2;
  size_t it_i = 0;
  for (FILE_DESC_SET::iterator it1 = set.begin(); it1 != set.end(); ++it1, it_i++)
  {
    if (it1->Reported) continue;

    unsigned sameSizeFileCount = 0;
    for (it2 = it1, ++it2;
      it2 != set.end() && it2->Size.QuadPart == it1->Size.QuadPart && !it2->Reported &&;
      ++it2)
    {
      sameSizeFileCount++;
    }

    if (sameSizeFileCount == 0)
      continue;

    if (sameSizeFileCount == 1)
    {
      it2 = it1;
      do
      {
        ++it2;
      }
      while (it2 != set.end() && it2->Reported);
      
      assert(it2 != set.end() && it2->Size.QuadPart == it1->Size.QuadPart);

      if (FilesContentEqual(*it1, *it2))
      {
        // Report file described by it1 as first in dupliacted files group
        // Report file described by it2 as next in duplicated files group
      }
    }
    else
    {
      HANDLE hFile1 = CreateFile(it1->Path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
      LARGE_INTEGER fileSize1;
      fileSize1.LowPart = GetFileSize(hFile1, (LPDWORD)&fileSize1.HighPart);
      DWORD beginBlockSize = (DWORD)std::min<__int64>(fileSize1.QuadPart, (__int64)BUF_SIZE);
      DWORD bytesRead;
      ReadFile(hFile1, Buf1, beginBlockSize, &bytesRead, NULL);
      assert(bytesRead == beginBlockSize);

      for (it2 = it1, ++it2; it2 != set.end() && it2->Size.QuadPart == it1->Size.QuadPart && !it2->Reported; ++it2)
      {
        HANDLE hFile2 = CreateFile(it2->Path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
        LARGE_INTEGER fileSize2;
        fileSize2.LowPart = GetFileSize(hFile2, (LPDWORD)&fileSize2.HighPart);
        if (fileSize2.QuadPart == fileSize1.QuadPart)
        {
          ReadFile(hFile2, Buf2, beginBlockSize, &bytesRead, NULL);

          if (memcmp(Buf1, Buf2, beginBlockSize) == 0)
          {
            bool filesIdentical = (fileSize1.QuadPart == (__int64)beginBlockSize);

            if (!filesIdentical)
            {
              if (!it1->MD5Calculated)
              {
                MD5_Calc md5Calc;
                md5Calc.Write(Buf1, beginBlockSize);
                unsigned __int64 bytesLeft = (unsigned __int64)fileSize1.QuadPart - (unsigned __int64)beginBlockSize;
                DWORD bytesToRead;
                while (bytesLeft > 0)
                {
                  bytesToRead = (DWORD)std::min<unsigned __int64>(bytesLeft, (unsigned __int64)BUF_SIZE);
                  ReadFile(hFile1, _Buf3, bytesToRead, &bytesRead, NULL);
                  assert(bytesRead == bytesToRead);
                  md5Calc.Write(_Buf3, bytesToRead);
                  bytesLeft -= (unsigned __int64)bytesToRead;
                }
                md5Calc.Finish(&it1->MD5);
                it1->MD5Calculated = true;
              }

              if (!it2->MD5Calculated)
              {
                MD5_Calc md5Calc;
                md5Calc.Write(Buf2, beginBlockSize);
                unsigned __int64 bytesLeft = (unsigned __int64)fileSize2.QuadPart - (unsigned __int64)beginBlockSize;
                DWORD bytesToRead;
                while (bytesLeft > 0)
                {
                  bytesToRead = (DWORD)std::min<unsigned __int64>(bytesLeft, (unsigned __int64)BUF_SIZE);
                  ReadFile(hFile2, _Buf3, bytesToRead, &bytesRead, NULL);
                  assert(bytesRead == bytesToRead);
                  md5Calc.Write(_Buf3, bytesToRead);
                  bytesLeft -= (unsigned __int64)bytesToRead;
                }
                md5Calc.Finish(&it2->MD5);
                it2->MD5Calculated = true;
              }

              filesIdentical = (it1->MD5 == it2->MD5);
            }

            if (filesIdentical)
            {
              if (!it1->Reported)
              {
                // Report file described by it1 as first in dupliacted files group
                it1->Reported = true;
              }

              // Report file described by it1 as next in dupliacted files group
              it2->Reported = true;
            }
          }
        }
        CloseHandle(hFile2);
      }
      CloseHandle(hFile1);
    }
  }
}

And finally here is a function to compare two files by content:

bool FilesContentEqual(const FILE_DESC &fd1, const FILE_DESC &fd2)
{
  HANDLE hFile1 = CreateFile(fd1.Path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
  HANDLE hFile2 = CreateFile(fd2.Path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);

  LARGE_INTEGER fileSize1, fileSize2;
  fileSize1.LowPart = GetFileSize(hFile1, (LPDWORD)&fileSize1.HighPart);
  fileSize2.LowPart = GetFileSize(hFile2, (LPDWORD)&fileSize2.HighPart);
  if (fileSize1.QuadPart != fileSize2.QuadPart)
  {
    CloseHandle(hFile2);
    CloseHandle(hFile1);
    return false;
  }

  unsigned __int64 bytesLeft = (unsigned __int64)fileSize1.QuadPart;
  DWORD bytesToRead, bytesRead;
  while (bytesLeft > 0)
  {
    bytesToRead = (DWORD)std::min<unsigned __int64>(bytesLeft, (unsigned __int64)BUF_SIZE);
    ReadFile(hFile1, Buf1, bytesToRead, &bytesRead, NULL);
    ReadFile(hFile2, Buf2, bytesToRead, &bytesRead, NULL);

    if (memcmp(Buf1, Buf2, bytesToRead) != 0)
    {
      CloseHandle(hFile2);
      CloseHandle(hFile1);
      return false;
    }
    bytesLeft -= (unsigned __int64)bytesToRead;
  }

  CloseHandle(hFile2);
  CloseHandle(hFile1);
  return true;
}

Comments | #algorithms #winapi Share