obitools3/src/utils.c

/****************************************************************************
 * Utility functions                                                       *
 ****************************************************************************/

/**
 * @file utils.c
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @date 29 March 2016
 * @brief Code for utility functions.
 */

#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <unistd.h>
#include <time.h>
#include <math.h>

#include "utils.h"
#include "obidebug.h"
#include "obierrno.h"


#define DEBUG_LEVEL 0	// TODO has to be defined somewhere else (cython compil flag?)


/**************************************************************************
 *
 * D E C L A R A T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 **************************************************************************/


/**
 * Internal function returning the complement of a nucleotide base.
 *
 * @warning The base must be in lower case.
 *
 * @param nucAc The nucleotide base.
 *
 * @returns The complement of the nucleotide base.
 * @retval The nucleotide base itself if no complement was found.
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @note Copied from ecoPCR source code
 */
static char nuc_base_complement(char nucAc);


/**
 * Internal function returning the complement of a nucleotide sequence.
 *
 * @warning The sequence must be in lower case.
 * @warning The sequence will be replaced by its complement without being copied.
 *
 * @param nucAcSeq The nucleotide sequence.
 *
 * @returns The complemented sequence.
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @note Copied from ecoPCR source code
 */
static char* nuc_seq_complement(char* nucAcSeq);


/**
 * Internal function returning the reverse of a nucleotide sequence.
 *
 * @warning The sequence must be in lower case.
 * @warning The sequence will be replaced by its reverse without being copied.
 *
 * @param str The nucleotide sequence.
 * @param isPattern Whether the sequence is a pattern. TODO
 *
 * @returns The reversed sequence.
 *
 * @since December 2016
 * @author Celine Mercier (celine.mercier@metabarcoding.org)
 * @note Copied from ecoPCR source code
 */
static char* reverse_sequence(char* str, char isPattern);


/************************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P R I V A T E   F U N C T I O N S
 *
 ************************************************************************/

static char nuc_base_complement(char nucAc)
{
    char* c;

    if ((c = strchr(DNA_ALPHA, nucAc)))
        return CDNA_ALPHA[(c - DNA_ALPHA)];
    else
        return nucAc;
}


static char* nuc_seq_complement(char* nucAcSeq)
{
    char *s;

    for (s = nucAcSeq ; *s ; s++)
        *s = nuc_base_complement(*s);

    return nucAcSeq;
}


static char* reverse_sequence(char* str, char isPattern)
{
        char *sb, *se, c;

        if (! str)
            return str;

        sb = str;
        se = str + strlen(str) - 1;

        while(sb <= se) {
           c    = *sb;
          *sb++ = *se;
          *se-- = c;
        }

		sb = str;
		se = str + strlen(str) - 1;

		if (isPattern)
			for (;sb < se; sb++)
			{
				if (*sb=='#')
				{
					if (((se - sb) > 2) && (*(sb+2)=='!'))
					{
						*sb='!';
						sb+=2;
						*sb='#';
					}
					else
					{
						*sb=*(sb+1);
						sb++;
						*sb='#';
					}
				}
				else if (*sb=='!')
					{
						*sb=*(sb-1);
						*(sb-1)='!';
					}
			}

        return str;
}


/**********************************************************************
 *
 * D E F I N I T I O N   O F   T H E   P U B L I C   F U N C T I O N S
 *
 **********************************************************************/


int copy_file(const char* src_file_path, const char* dest_file_path)
{
	int src_fd, dst_fd, n, err;
	unsigned char buffer[4096];

	src_fd = open(src_file_path, O_RDONLY);
	if (src_fd == -1)
    {
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nError opening a file to copy");
        return -1;
    }
    dst_fd = open(dest_file_path, O_CREAT | O_WRONLY, 0777);  // overwrite if already exists
    if (dst_fd == -1)
    {
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nError opening a file to write a copy: %s", dest_file_path);
        return -1;
    }

    while (1)
    {
        err = read(src_fd, buffer, 4096);
        if (err == -1)
        {
    		obi_set_errno(OBI_UTILS_ERROR);
    		obidebug(1, "\nProblem reading a file to copy");
            return -1;
        }
        n = err;

        if (n == 0)
        	break;

        err = write(dst_fd, buffer, n);
        if (err == -1)
        {
    		obi_set_errno(OBI_UTILS_ERROR);
    		obidebug(1, "\nProblem writing to a file while copying");
            return -1;
        }
    }

    if (close(src_fd) < 0)
    {
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nError closing a file after copying it");
        return -1;
    }
    if (close(dst_fd) < 0)
    {
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nError closing a file after copying to it");
        return -1;
    }

    return 0;
}


int digit_count(index_t i)
{
	int n_digits;

	if (i == 0)
		n_digits = 1;
	else
		n_digits = floor(log10(llabs(i))) + 1;

	return n_digits;
}


char* build_word_with_idx(const char* prefix, index_t idx)
{
	char* word;
	int   n_digits;

	n_digits = digit_count(idx);
	word = (char*) malloc((strlen(prefix) + 1+ n_digits + 1)*sizeof(char));
	if (word == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory for a character string");
		return NULL;
	}
	if (sprintf(word, "%s_%lld", prefix, idx) < 0)
	{
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nProblem building a word from a prefix and an index");
		return NULL;
	}

	return word;
}


int count_dir(char* dir_path)
{
	struct dirent* dp;
	DIR*           fd;
	int            count;

	count = 0;
	if ((fd = opendir(dir_path)) == NULL)
	{
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "Error opening a directory: %s\n", dir_path);
		return -1;
	}
	while ((dp = readdir(fd)) != NULL)
	{
		if ((dp->d_name)[0] == '.')
			continue;
		count++;
	}

	if (closedir(fd) < 0)
	{
		obi_set_errno(OBI_UTILS_ERROR);
		obidebug(1, "\nError closing a directory");
		return -1;
	}

	return count;
}


char* obi_format_date(time_t date)
{
	char*       formatted_time;
	struct tm*  tmp;

	formatted_time = (char*) malloc(FORMATTED_TIME_LENGTH*sizeof(char));
	if (formatted_time == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory to format a date");
		return NULL;
	}

	tmp = localtime(&date);
	if (tmp == NULL)
	{
		obi_set_errno(OBICOL_UNKNOWN_ERROR);
		obidebug(1, "\nError formatting a date");
		return NULL;
	}

	if (strftime(formatted_time, FORMATTED_TIME_LENGTH, "%c", tmp) == 0)
	{
		obi_set_errno(OBICOL_UNKNOWN_ERROR);
		obidebug(1, "\nError formatting a date");
		return NULL;
	}

	return formatted_time;
}


void* obi_get_memory_aligned_on_16(int size, int* shift)
{
	void* memory;

	*shift = 0;

	memory = (void*) malloc(size);
	if (memory == NULL)
	{
		obi_set_errno(OBI_MALLOC_ERROR);
		obidebug(1, "\nError allocating memory");
		return NULL;
	}

	while ((((long long unsigned int) (memory))%16) != 0)
	{
		memory++;
		(*shift)++;
	}

	return (memory);
}


/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; version 2.
 */
void* bsearch_user_data(const void* key, const void* base, size_t num, size_t size, const void* user_data,
              	  	  	int (*cmp)(const void *key, const void *elt, const void* user_data))
{
	size_t start = 0;
	size_t end   = num;
	size_t mid;
    int result;

    while (start < end)
    {
    	mid = start + (end - start) / 2;
    	result = cmp(key, base + mid * size, user_data);
		if (result < 0)
			end = mid;
		else if (result > 0)
			start = mid + 1;
		else
			return (void*)base + mid * size;
    }

    return NULL;
}


/*
 * Copyright (c) 1992, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
 */

#define MIN(a,b) ((a) < (b) ? a : b)

#define swapcode(TYPE, parmi, parmj, n) {               \
        long i = (n) / sizeof (TYPE);                   \
        register TYPE *pi = (TYPE *) (parmi);           \
        register TYPE *pj = (TYPE *) (parmj);           \
        do {                                            \
                register TYPE   t = *pi;                \
                *pi++ = *pj;                            \
                *pj++ = t;                              \
        } while (--i > 0);                              \
}

#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
        es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;

static __inline void
swapfunc(char *a, char *b, int n, int swaptype)
{
        if (swaptype <= 1)
                swapcode(long, a, b, n)
        else
                swapcode(char, a, b, n)
}

#define swap(a, b)                                      \
        if (swaptype == 0) {                            \
                long t = *(long *)(a);                  \
                *(long *)(a) = *(long *)(b);            \
                *(long *)(b) = t;                       \
        } else                                          \
                swapfunc(a, b, es, swaptype)

#define vecswap(a, b, n)        if ((n) > 0) swapfunc(a, b, n, swaptype)

static __inline char *
med3(char *a, char *b, char *c, const void *user_data, int (*cmp)(const void *, const void *, const void *))
{
        return cmp(a, b, user_data) < 0 ?
               (cmp(b, c, user_data) < 0 ? b : (cmp(a, c, user_data) < 0 ? c : a ))
              :(cmp(b, c, user_data) > 0 ? b : (cmp(a, c, user_data) < 0 ? a : c ));
}

void
qsort_user_data(void *aa, size_t n, size_t es, const void *user_data, int (*cmp)(const void *, const void *, const void *))
{
        char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
        int d, r, swaptype, swap_cnt;
        register char *a = aa;

loop:   SWAPINIT(a, es);
        swap_cnt = 0;
        if (n < 7) {
                for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es)
                        for (pl = pm; pl > (char *) a && cmp(pl - es, pl, user_data) > 0;
                             pl -= es)
                                swap(pl, pl - es);
                return;
        }
        pm = (char *)a + (n / 2) * es;
        if (n > 7) {
                pl = (char *)a;
                pn = (char *)a + (n - 1) * es;
                if (n > 40) {
                        d = (n / 8) * es;
                        pl = med3(pl, pl + d, pl + 2 * d, user_data, cmp);
                        pm = med3(pm - d, pm, pm + d, user_data, cmp);
                        pn = med3(pn - 2 * d, pn - d, pn, user_data, cmp);
                }
                pm = med3(pl, pm, pn, user_data, cmp);
        }
        swap(a, pm);
        pa = pb = (char *)a + es;

        pc = pd = (char *)a + (n - 1) * es;
        for (;;) {
                while (pb <= pc && (r = cmp(pb, a, user_data)) <= 0) {
                        if (r == 0) {
                                swap_cnt = 1;
                                swap(pa, pb);
                                pa += es;
                        }
                        pb += es;
                }
                while (pb <= pc && (r = cmp(pc, a, user_data)) >= 0) {
                        if (r == 0) {
                                swap_cnt = 1;
                                swap(pc, pd);
                                pd -= es;
                        }
                        pc -= es;
                }
                if (pb > pc)
                        break;
                swap(pb, pc);
                swap_cnt = 1;
                pb += es;
                pc -= es;
        }
        if (swap_cnt == 0) {  /* Switch to insertion sort */
                for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
                        for (pl = pm; pl > (char *) a && cmp(pl - es, pl, user_data) > 0;
                             pl -= es)
                                swap(pl, pl - es);
                return;
        }

        pn = (char *)a + n * es;
        r = MIN(pa - (char *)a, pb - pa);
        vecswap(a, pb - r, r);
        r = MIN((long)(pd - pc), (long)(pn - pd - es));
        vecswap(pb, pn - r, r);
        if ((r = pb - pa) > (int)es)
                qsort_user_data(a, r / es, es, user_data, cmp);
        if ((r = pd - pc) > (int)es) {
                /* Iterate rather than recurse to save stack space */
                a = pn - r;
                n = r / es;
                goto loop;
        }
/*              qsort(pn - r, r / es, es, cmp);*/
}


char* reverse_complement_pattern(char* nucAcSeq)
{
    return reverse_sequence(nuc_seq_complement(nucAcSeq), 1);
}


char* reverse_complement_sequence(char* nucAcSeq)
{
    return reverse_sequence(nuc_seq_complement(nucAcSeq), 0);
}