Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,7 @@ doc/build
# spack files
.spack*
spack.lock

.vs/

build/
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "xxhash"]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would recommend sticking to declared external dependencies for an upstream PR. debian:latest already provides xxhash for both development and runtime, whereas this project currently has no submodules, and adding one complicates things. For example, I'm adding a Docker container build, and without the submodule I would be able to build your PR, including these fixes, on debian:latest.

path = xxhash
url = https://github.com/theAeon/xxhash
branch = xxhash-mfu
8 changes: 5 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ IF(ENABLE_LIBARCHIVE)
ADD_DEFINITIONS(-DLIBARCHIVE_SUPPORT)
ENDIF(ENABLE_LIBARCHIVE)

## hdf5
## hdf5
OPTION(ENABLE_HDF5 "Enable HDF5 library")
MESSAGE(STATUS "ENABLE_HDF5: ${ENABLE_HDF5}")
IF(ENABLE_HDF5)
Expand Down Expand Up @@ -147,8 +147,9 @@ IF(LibCap_FOUND)
LIST(APPEND MFU_EXTERNAL_LIBS ${LibCap_LIBRARIES})
ENDIF(LibCap_FOUND)

## OPENSSL for ddup
FIND_PACKAGE(OpenSSL)
## XXHASH for ddup
set(XXHASH_BUNDLED_MODE ON)
ADD_SUBDIRECTORY(${CMAKE_CURRENT_SOURCE_DIR}/xxhash/cmake_unofficial)

# Setup Installation

Expand Down Expand Up @@ -198,3 +199,4 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src/common)
ADD_SUBDIRECTORY(src)
ADD_SUBDIRECTORY(test)
ADD_SUBDIRECTORY(man)

2 changes: 1 addition & 1 deletion src/ddup/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
MFU_ADD_TOOL(ddup)
TARGET_LINK_LIBRARIES(ddup ${OPENSSL_LIBRARIES})
TARGET_LINK_LIBRARIES(ddup xxhash)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If xxhash is used inlined, this line is unnecessary: there is no library to link against.

66 changes: 32 additions & 34 deletions src/ddup/ddup.c
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
#include <stdio.h>
#include <string.h>
#include <getopt.h>
#include <openssl/sha.h>
#include <assert.h>
#include <inttypes.h>
#include <xxh3.h>
#include "mpi.h"
#include "dtcmp.h"
#include "mfu.h"
#include "list.h"

/* number of uint64_t values in our key
* 1 for group ID + (SHA256_DIGEST_LENGTH / 8) */
#define DDUP_KEY_SIZE 5

* 1 for group ID + (XXH3_DIGEST_LENGTH / 8) */
#define DDUP_KEY_SIZE 2
/* XXH3 64-bit output is an unsigned 64-bit integer, so the digest is 8 bytes long */
#define XXH3_DIGEST_LENGTH 8
/* amount of data to read in order to compute hash */
#define DDUP_CHUNK_SIZE 1048576
#define DDUP_CHUNK_SIZE 4096

/* Print a usage message */
static void print_usage(void)
Expand All @@ -35,11 +36,11 @@ static void print_usage(void)
/* create MPI datatypes for key and key and satellite data */
static void mpi_type_init(MPI_Datatype* key, MPI_Datatype* keysat)
{
assert(SHA256_DIGEST_LENGTH == (DDUP_KEY_SIZE - 1) * 8);
assert(XXH3_DIGEST_LENGTH == (DDUP_KEY_SIZE - 1) * 8);

/*
* Build MPI datatype for key.
* 1 for group ID + (SHA256_DIGEST_LENGTH / 8)
* 1 for group ID + (XXH3_DIGEST_LENGTH / 8)
*/
MPI_Type_contiguous(DDUP_KEY_SIZE, MPI_UINT64_T, key);
MPI_Type_commit(key);
Expand Down Expand Up @@ -141,14 +142,14 @@ static int read_data(const char* fname, char* chunk_buf, uint64_t chunk_id,
}

struct file_item {
SHA256_CTX ctx;
XXH3_state_t state;
};

/* print SHA256 value to stdout */
static void dump_sha256_digest(char* digest_string, unsigned char digest[])
/* format the XXH3 digest bytes as a hex string into digest_string;
 * digest_string must hold XXH3_DIGEST_LENGTH * 2 + 1 bytes */
static void dump_xxh3_digest(char* digest_string, unsigned char digest[])
{
int i;
for (i = 0; i < SHA256_DIGEST_LENGTH; i++) {
for (i = 0; i < XXH3_DIGEST_LENGTH; i++) {
sprintf(&digest_string[i * 2], "%02x", (unsigned int)digest[i]);
}
}
Expand All @@ -161,7 +162,7 @@ int main(int argc, char** argv)

uint64_t chunk_size = DDUP_CHUNK_SIZE;

SHA256_CTX* ctx_ptr;
XXH3_state_t* state_ptr;

MPI_Init(NULL, NULL);
mfu_init();
Expand Down Expand Up @@ -310,8 +311,8 @@ int main(int argc, char** argv)
/* get local number of items in flist */
uint64_t checking_files = mfu_flist_size(flist);

/* allocate memory to hold SHA256 context values */
struct file_item* file_items = (struct file_item*) MFU_MALLOC(checking_files * sizeof(*file_items));
/* allocate memory to hold XXH3 context values */
struct file_item* file_items = (struct file_item*) XXH_alignedMalloc(checking_files * sizeof(*file_items), 128);

/* Allocate two lists of length size, where each
* element has (DDUP_KEY_SIZE + 1) uint64_t values
Expand Down Expand Up @@ -346,8 +347,9 @@ int main(int argc, char** argv)
/* record our index in flist */
ptr[DDUP_KEY_SIZE] = i;

/* initialize the SHA256 hash state for this file */
SHA256_Init(&file_items[i].ctx);
/* initialize the XXH3 hash state for this file */
XXH3_INITSTATE(&file_items[i].state);
XXH3_64bits_reset(&file_items[i].state);

/* increment our file count */
new_checking_files++;
Expand Down Expand Up @@ -376,7 +378,7 @@ int main(int argc, char** argv)
/* update the chunk id we'll read from all files */
chunk_id++;

/* iterate over our list and compute SHA256 value for each */
/* iterate over our list and compute XXH3 value for each */
ptr = list;
for (i = 0; i < checking_files; i++) {
/* get the flist index for this item */
Expand All @@ -399,18 +401,14 @@ int main(int argc, char** argv)
"process", fname);
}

/* update the SHA256 context for this file */
ctx_ptr = &file_items[idx].ctx;
SHA256_Update(ctx_ptr, chunk_buf, data_size);
/* update the XXH3 context for this file */
state_ptr = &file_items[idx].state;
XXH3_64bits_update(state_ptr, chunk_buf, data_size);

/*
* Use SHA256 value as key.
* This is actually an hack, but SHA256_Final can't
* be called multiple times with out changing ctx
* Use XXH3 digest as key.
*/
SHA256_CTX ctx_tmp;
memcpy(&ctx_tmp, ctx_ptr, sizeof(ctx_tmp));
SHA256_Final((unsigned char*)(ptr + 1), &ctx_tmp);
XXH64_hash_t result = XXH3_64bits_digest(state_ptr);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
XXH64_hash_t result = XXH3_64bits_digest(state_ptr);
XXH64_hash_t result = XXH3_64bits_digest(state_ptr);
memcpy(ptr + 1, &result, sizeof(result));

Without storing the result, I don't think anything would de-dupe.


/* move on to next file in the list */
ptr += DDUP_KEY_SIZE + 1;
Expand Down Expand Up @@ -441,8 +439,8 @@ int main(int argc, char** argv)
/* look up file size */
file_size = mfu_flist_file_get_size(flist, idx);

/* get a pointer to the SHA256 context for this file */
ctx_ptr = &file_items[idx].ctx;
/* get a pointer to the XXH3 context for this file */
state_ptr = &file_items[idx].state;

if (group_ranks[i] == 1) {
/*
Expand All @@ -457,11 +455,11 @@ int main(int argc, char** argv)
* duplicate with other files that also have
* matching group_id[i]
*/
unsigned char digest[SHA256_DIGEST_LENGTH];
SHA256_Final(digest, ctx_ptr);

char digest_string[SHA256_DIGEST_LENGTH * 2 + 1];
dump_sha256_digest(digest_string, digest);
XXH64_hash_t digest = XXH3_64bits_digest(state_ptr);
XXH64_canonical_t digest_canon;
XXH64_canonicalFromHash(&digest_canon, digest);
char digest_string[XXH3_DIGEST_LENGTH];

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stack corruption: XXH3_DIGEST_LENGTH is 8, but digest_string receives the hex encoding of the digest (two characters per byte) plus a trailing NUL, so the array must be sized XXH3_DIGEST_LENGTH * 2 + 1, just like the old code.

dump_xxh3_digest(digest_string, digest_canon.digest);
printf("%s %s\n", fname, digest_string);
} else {
/* Have multiple files with the same checksum,
Expand Down Expand Up @@ -519,7 +517,7 @@ int main(int argc, char** argv)
mfu_free(&group_id);
mfu_free(&new_list);
mfu_free(&list);
mfu_free(&file_items);
XXH_alignedFree(file_items);
mfu_free(&chunk_buf);
mfu_flist_free(&flist);

Expand Down
1 change: 1 addition & 0 deletions xxhash
Submodule xxhash added at a0a745