Logo Search packages:      
Sourcecode: hdf5 version File versions

H5FDmpiposix.c

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 * Copyright by the Board of Trustees of the University of Illinois.         *
 * All rights reserved.                                                      *
 *                                                                           *
 * This file is part of HDF5.  The full HDF5 copyright notice, including     *
 * terms governing use, modification, and redistribution, is contained in    *
 * the files COPYING and Copyright.html.  COPYING can be found at the root   *
 * of the source code distribution tree; Copyright.html can be found at the  *
 * root level of an installed copy of the electronic HDF5 document set and   *
 * is linked from the top-level documents page.  It can also be found at     *
 * http://hdf.ncsa.uiuc.edu/HDF5/doc/Copyright.html.  If you do not have     *
 * access to either file, you may request a copy from hdfhelp@ncsa.uiuc.edu. *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/*
 * Programmer:  Quincey Koziol <koziol@ncsa.uiuc.edu>
 *              Thursday, July 11, 2002
 *
 * Purpose: This is a "combination" MPI-2 and posix I/O driver.
 *              It uses MPI for coordinating the actions of several processes
 *              and posix I/O calls to do the actual I/O to the disk.
 *
 *              This driver was derived from the H5FDmpio.c driver and may
 *              share bugs/quirks/etc.
 *
 * Limitations:
 *              There is no "collective" I/O mode with this driver.
 *
 *              This will almost certainly _not_ work correctly for files
 *              accessed on distributed parallel systems with the file located
 *              on a non-parallel filesystem.
 *
 */
#include "H5private.h"        /*library functions                 */
#include "H5ACprivate.h"        /* Metadata cache */
#include "H5Eprivate.h"       /*error handling              */
#include "H5Fprivate.h"       /*files                             */
#include "H5FDprivate.h"      /*file driver                         */
#include "H5FDmpiposix.h"       /* MPI/posix I/O file driver            */
#include "H5Iprivate.h"       /* IDs                              */
#include "H5MMprivate.h"        /*memory allocation                     */
#include "H5Pprivate.h"       /*property lists              */

/* Features:
 *   H5_HAVE_GPFS   -- issue gpfs_fcntl() calls to hopefully improve
 *                     performance when accessing files on a GPFS
 *                     file system.
 *
 *   REPORT_IO      -- if set then report all POSIX file calls to stderr.
 *
 */
/* #define REPORT_IO */

#ifdef H5_HAVE_GPFS
#   include <gpfs_fcntl.h>
#endif

#ifdef H5_HAVE_PARALLEL

/*
 * The driver identification number.  It starts out as 0 and is assigned by
 * H5FD_mpiposix_init(), which registers the driver class with H5FDregister()
 * whenever the current value is not an identifier of type H5I_VFL.  The
 * variable is only compiled when H5_HAVE_PARALLEL is defined.  This allows
 * applications to still have the H5FD_MPIPOSIX "constants" in their source
 * code (it also makes this file strictly ANSI compliant when
 * H5_HAVE_PARALLEL isn't defined).
 */
static hid_t H5FD_MPIPOSIX_g = 0;

/*
 * File operations.
 *
 * Codes for the `op' field of H5FD_mpiposix_t, which records the last file
 * I/O operation.  H5FD_mpiposix_open() resets that field to OP_UNKNOWN.
 */
#define OP_UNKNOWN      0
#define OP_READ         1
#define OP_WRITE  2

/*
 * The description of a file belonging to this driver.
 *
 * The EOF value is only used just after the file is opened in order for the
 * library to determine whether the file is empty, truncated, or okay. The
 * MPIPOSIX driver doesn't bother to keep it updated since it's an expensive
 * operation.
 *
 * H5FD_mpiposix_open() fills in fd, eof, blksize, use_gpfs, comm, mpi_rank
 * and mpi_size, and resets pos/op to HADDR_UNDEF/OP_UNKNOWN.
 */
typedef struct H5FD_mpiposix_t {
    H5FD_t  pub;        /* Public stuff, must be first (the struct is cast to and from H5FD_t*) */
    int           fd;         /* The unix file handle returned by HDopen() */
    MPI_Comm      comm;       /* Communicator (a duplicate made at open time) */
    int         mpi_rank;       /* This process's rank in comm          */
    int         mpi_size;       /* Total number of processes in comm    */
    haddr_t eof;        /* End-of-file marker (st_size at open time) */
    haddr_t eoa;        /* End-of-address marker            */
    haddr_t last_eoa;   /* Last known end-of-address marker */
    haddr_t pos;        /* Current file I/O position          */
    int           op;         /* Last file I/O operation (one of the OP_* codes) */
    hsize_t naccess;    /* Number of (write) accesses to file   */
    size_t      blksize;        /* Block size of file system (st_blksize at open time) */
    hbool_t     use_gpfs;       /* Use GPFS hints; copied from the file access properties */
#ifndef WIN32
    /*
     * On most systems the combination of device and i-node number uniquely
     * identify a file.
     */
    dev_t   device;           /* File device number         */
    ino_t   inode;            /* File i-node number         */
#else
    /*
     * On WIN32 the low-order word of a unique identifier associated with the
     * file and the volume serial number uniquely identify a file. This number
     * (which, both? -rpm) may change when the system is restarted or when the
     * file is opened. After a process opens a file, the identifier is
     * constant until the file is closed. An application can use this
     * identifier and the volume serial number to determine whether two
     * handles refer to the same file.
     */
    /* presumably the low-order counterpart of fileindexhi — TODO confirm where it is assigned */
    int fileindexlo;
    /* Set from nFileIndexHigh of GetFileInformationByHandle() at open time */
    int fileindexhi;
#endif
} H5FD_mpiposix_t;

/*
 * This driver supports systems that have the lseek64() function by defining
 * some macros here so we don't have to have conditional compilations later
 * throughout the code.  Three configurations are selected, in this order:
 * H5_HAVE_LSEEK64, WIN32 without __MWERKS__ (MSVC), and everything else.
 *
 * file_offset_t: The datatype for file offsets, the second argument of
 *                the file seek call (off64_t, __int64 or off_t).
 *
 * file_seek:     The function which adjusts the current file position
 *                (lseek64, _lseeki64 or HDlseek).
 *
 * file_truncate: The function used to truncate a file (ftruncate64,
 *                _ftruncatei64 or HDftruncate).
 */
/* The WIN32 branch was added for Windows NT file system support. */
/* pvn: added __MWERKS__ support (Metrowerks falls through to the generic branch). */
/* NOTE(review): confirm that _ftruncatei64 is actually provided for MSVC builds. */

#ifdef H5_HAVE_LSEEK64
#   define file_offset_t      off64_t
#   define file_seek          lseek64
#   define file_truncate      ftruncate64
#elif defined (WIN32) && !defined(__MWERKS__)
# /*MSVC*/
#   define file_offset_t __int64
#   define file_seek _lseeki64
#   define file_truncate      _ftruncatei64
#else
#   define file_offset_t      off_t
#   define file_seek          HDlseek
#   define file_truncate      HDftruncate
#endif

/*
 * These macros check for overflow of various quantities.  These macros
 * assume that file_offset_t is signed and haddr_t and size_t are unsigned.
 *
 * MAXADDR:       All bits of file_offset_t except the top (sign) bit, i.e.
 *                the largest address a signed file_offset_t can hold.
 *
 * ADDR_OVERFLOW: Checks whether a file address of type `haddr_t'
 *                is too large to be represented by the second argument
 *                of the file seek function.  HADDR_UNDEF always counts
 *                as an overflow.
 *
 * SIZE_OVERFLOW: Checks whether a size (cast against `hsize_t') has any
 *                bit set above MAXADDR, i.e. is too large to be
 *                represented by file_offset_t.
 *
 * REGION_OVERFLOW:     Checks whether an address and size pair describe data
 *                which can be addressed entirely by the second
 *                argument of the file seek function.  It is also true
 *                whenever file_offset_t is narrower than size_t, when
 *                A+Z equals HADDR_UNDEF, or when A+Z compares below A
 *                after conversion to file_offset_t.
 *
 * Note that the arguments are evaluated more than once.
 */
#define MAXADDR (((haddr_t)1<<(8*sizeof(file_offset_t)-1))-1)
#define ADDR_OVERFLOW(A)      (HADDR_UNDEF==(A) ||                      \
                         ((A) & ~(haddr_t)MAXADDR))
#define SIZE_OVERFLOW(Z)      ((Z) & ~(hsize_t)MAXADDR)
#define REGION_OVERFLOW(A,Z)  (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) ||      \
                         sizeof(file_offset_t)<sizeof(size_t) ||      \
                                 HADDR_UNDEF==(A)+(Z) ||                \
                         (file_offset_t)((A)+(Z))<(file_offset_t)(A))

/*
 * Callbacks.
 *
 * Forward declarations of the static driver callbacks so that they can be
 * referenced by the H5FD_mpiposix_g class table before they are defined.
 *
 * NOTE(review): the read and write prototypes name their third parameter
 * `fapl_id' while flush names it `dxpl_id' — confirm against the
 * definitions which property list is actually passed.
 */
static void *H5FD_mpiposix_fapl_get(H5FD_t *_file);
static void *H5FD_mpiposix_fapl_copy(const void *_old_fa);
static herr_t H5FD_mpiposix_fapl_free(void *_fa);
static H5FD_t *H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
                        haddr_t maxaddr);
static herr_t H5FD_mpiposix_close(H5FD_t *_file);
static int H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
static herr_t H5FD_mpiposix_query(const H5FD_t *_f1, unsigned long *flags);
static haddr_t H5FD_mpiposix_get_eoa(H5FD_t *_file);
static herr_t H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr);
static haddr_t H5FD_mpiposix_get_eof(H5FD_t *_file);
static herr_t  H5FD_mpiposix_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
static herr_t H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
        size_t size, void *buf);
static herr_t H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr,
        size_t size, const void *buf);
static herr_t H5FD_mpiposix_flush(H5FD_t *_file, hid_t dxpl_id, unsigned closing);

/*
 * MPIPOSIX-specific file access properties.
 *
 * This is the driver info attached to a file access property list by
 * H5Pset_fapl_mpiposix().  The fapl copy/free callbacks duplicate and free
 * `comm', so each instance refers to its own communicator.
 */
typedef struct H5FD_mpiposix_fapl_t {
    hbool_t             use_gpfs;       /* Use GPFS hints (only acted on when H5_HAVE_GPFS is defined) */
    MPI_Comm            comm;       /* Communicator used to open the file */
} H5FD_mpiposix_fapl_t;

/*
 * The MPIPOSIX file driver information.
 *
 * This is the class table handed to H5FDregister() by H5FD_mpiposix_init().
 * The entries are positional initializers of H5FD_class_t; the trailing
 * comment on each line names the slot being filled.  Slots set to NULL or 0
 * are ones for which this driver supplies no callback or data.
 */
static const H5FD_class_t H5FD_mpiposix_g = {
    "mpiposix",                           /*name                  */
    MAXADDR,                              /*maxaddr         */
    H5F_CLOSE_SEMI,                       /* fc_degree            */
    NULL,                           /*sb_size         */
    NULL,                           /*sb_encode       */
    NULL,                           /*sb_decode       */
    sizeof(H5FD_mpiposix_fapl_t),         /*fapl_size       */
    H5FD_mpiposix_fapl_get,               /*fapl_get        */
    H5FD_mpiposix_fapl_copy,              /*fapl_copy       */
    H5FD_mpiposix_fapl_free,              /*fapl_free       */
    0,                                    /*dxpl_size       */
    NULL,                           /*dxpl_copy       */
    NULL,                           /*dxpl_free       */
    H5FD_mpiposix_open,                   /*open                  */
    H5FD_mpiposix_close,                  /*close                 */
    H5FD_mpiposix_cmp,                      /*cmp                 */
    H5FD_mpiposix_query,                    /*query               */
    NULL,                           /*alloc                 */
    NULL,                           /*free                  */
    H5FD_mpiposix_get_eoa,                /*get_eoa         */
    H5FD_mpiposix_set_eoa,                /*set_eoa         */
    H5FD_mpiposix_get_eof,                /*get_eof         */
    H5FD_mpiposix_get_handle,                   /*get_handle            */
    H5FD_mpiposix_read,                   /*read                  */
    H5FD_mpiposix_write,                  /*write                 */
    H5FD_mpiposix_flush,                  /*flush                 */
    NULL,                                       /*lock                  */
    NULL,                                       /*unlock                */
    H5FD_FLMAP_SINGLE                     /*fl_map          */
};

/*
 * Interface initialization.
 *
 * NOTE(review): PABLO_MASK, INTERFACE_INIT and interface_initialize_g are
 * not referenced by name in this file's visible code; presumably they are
 * consumed by the FUNC_ENTER_* macros — confirm against H5private.h.
 * INTERFACE_INIT names H5FD_mpiposix_init() as this interface's
 * initialization routine.
 */
#define PABLO_MASK      H5FD_mpiposix_mask
#define INTERFACE_INIT  H5FD_mpiposix_init
static int interface_initialize_g = 0;


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_init
 *
 * Purpose: Initialize this driver by registering the driver with the
 *          library.
 *
 * Return:  Success:    The driver ID for the mpiposix driver.
 *
 *          Failure:    Negative.
 *
 * Programmer:    Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
hid_t
H5FD_mpiposix_init(void)
{
    hid_t ret_value=H5FD_MPIPOSIX_g;    /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_init, FAIL);

    /* Register the driver class unless the cached ID already is a VFL ID */
    if (H5Iget_type(H5FD_MPIPOSIX_g) != H5I_VFL) {
        H5FD_MPIPOSIX_g = H5FDregister(&H5FD_mpiposix_g);
    }

    /* Hand back whatever ID is cached now */
    ret_value = H5FD_MPIPOSIX_g;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_init() */

#ifdef H5_WANT_H5_V1_4_COMPAT

/*-------------------------------------------------------------------------
 * Function:      H5Pset_fapl_mpiposix
 *
 * Purpose: Store the user supplied MPI communicator COMM in
 *          the file access property list FAPL_ID which can then be used
 *          to create and/or open the file.  This function is available
 *          only in the parallel HDF5 library and is not collective.
 *
 *          comm is the MPI communicator to be used for file open as
 *          defined in MPI_FILE_OPEN of MPI-2. This function makes a
 *          duplicate of comm. Any modification to comm after this function
 *          call returns has no effect on the access property list.
 *
 *              If fapl_id has previously set comm value, it will be replaced
 *              and the old communicator is freed.
 *
 * Return:  Success:    Non-negative
 *          Failure:    Negative
 *
 * Programmer:    Quincey Koziol
 *          Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Modified the description of the function that it now stores
 *          a duplicate of the communicator.  Free the old duplicate if
 *          previously set.  (Work is actually done by H5P_set_driver.)
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm)
{
    H5FD_mpiposix_fapl_t  driver_info;  /* Driver properties handed to the list */
    H5P_genplist_t       *fapl_plist;   /* Property list behind fapl_id */
    herr_t                ret_value;    /* Return value */

    FUNC_ENTER_API(H5Pset_fapl_mpiposix, FAIL);
    H5TRACE2("e","iMc",fapl_id,comm);

    /* Validate the property list first, then the communicator */
    fapl_plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS);
    if (NULL == fapl_plist) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
    }
    if (comm == MPI_COMM_NULL) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator");
    }

    /* This 1.4-compatible signature has no GPFS flag, so hints stay off */
    driver_info.use_gpfs = FALSE;
    driver_info.comm = comm;

    /* The communicator is duplicated while the driver is being set */
    ret_value = H5P_set_driver(fapl_plist, H5FD_MPIPOSIX, &driver_info);

done:
    FUNC_LEAVE_API(ret_value);
} /* end H5Pset_fapl_mpiposix() */


/*-------------------------------------------------------------------------
 * Function:      H5Pget_fapl_mpiposix
 *
 * Purpose: If the file access property list is set to the H5FD_MPIPOSIX
 *          driver then this function returns a duplicate of the MPI
 *          communicator through the comm pointer. It is the responsibility
 *          of the application to free the returned communicator.
 *
 * Return:  Success:    Non-negative with the communicator and
 *                      information returned through the COMM 
 *                      argument if non-null.  Since it is a duplicate
 *                      of the stored object, future modifications to
 *                      the access property list do not affect it and
 *                      it is the responsibility of the application to
 *                      free it.
 *
 *          Failure:    Negative
 *
 * Programmer:    Quincey Koziol
 *          Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Return duplicate of the stored communicator.
 *
 *              Bill Wendling, 2003-05-01
 *              Return the USE_GPFS flag.
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/)
{
    H5FD_mpiposix_fapl_t *driver_info;          /* Driver info stored on the list */
    H5P_genplist_t       *fapl_plist;           /* Property list behind fapl_id */
    int                   mpi_code;             /* MPI return code */
    herr_t                ret_value = SUCCEED;  /* Return value */

    FUNC_ENTER_API(H5Pget_fapl_mpiposix, FAIL);
    H5TRACE2("e","ix",fapl_id,comm);

    /* The list must be a file access list that uses this driver */
    fapl_plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS);
    if (NULL == fapl_plist) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
    }
    if (H5P_get_driver(fapl_plist) != H5FD_MPIPOSIX) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver");
    }
    driver_info = H5P_get_driver_info(fapl_plist);
    if (NULL == driver_info) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info");
    }

    /* Give the caller its own duplicate of the stored communicator */
    if (comm != NULL) {
        mpi_code = MPI_Comm_dup(driver_info->comm, comm);
        if (mpi_code != MPI_SUCCESS) {
            HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
        }
    }

done:
    FUNC_LEAVE_API(ret_value);
} /* end H5Pget_fapl_mpiposix() */
#else /* H5_WANT_H5_V1_4_COMPAT */

/*-------------------------------------------------------------------------
 * Function:      H5Pset_fapl_mpiposix
 *
 * Purpose: Store the user supplied MPI communicator COMM in
 *          the file access property list FAPL_ID which can then be used
 *          to create and/or open the file.  This function is available
 *          only in the parallel HDF5 library and is not collective.
 *
 *          comm is the MPI communicator to be used for file open as
 *          defined in MPI_FILE_OPEN of MPI-2. This function makes a
 *          duplicate of comm. Any modification to comm after this function
 *          call returns has no effect on the access property list.
 *
 *              If fapl_id has previously set comm value, it will be replaced
 *              and the old communicator is freed.
 *
 * Return:  Success:    Non-negative
 *          Failure:    Negative
 *
 * Programmer:    Quincey Koziol
 *          Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Modified the description of the function that it now stores
 *          a duplicate of the communicator.  Free the old duplicate if
 *          previously set.  (Work is actually done by H5P_set_driver.)
 *
 *          Bill Wendling, 2003-05-01
 *          Modified to take an extra flag indicating that we should
 *          use the GPFS hints (if available) for this file.
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm, hbool_t use_gpfs)
{
    H5FD_mpiposix_fapl_t  driver_info;  /* Driver properties handed to the list */
    H5P_genplist_t       *fapl_plist;   /* Property list behind fapl_id */
    herr_t                ret_value;    /* Return value */

    FUNC_ENTER_API(H5Pset_fapl_mpiposix, FAIL);
    H5TRACE3("e","iMcb",fapl_id,comm,use_gpfs);

    /* Validate the property list first, then the communicator */
    fapl_plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS);
    if (NULL == fapl_plist) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
    }
    if (comm == MPI_COMM_NULL) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator");
    }

    /* Collect the driver-specific settings supplied by the caller */
    driver_info.use_gpfs = use_gpfs;
    driver_info.comm = comm;

    /* The communicator is duplicated while the driver is being set */
    ret_value = H5P_set_driver(fapl_plist, H5FD_MPIPOSIX, &driver_info);

done:
    FUNC_LEAVE_API(ret_value);
} /* end H5Pset_fapl_mpiposix() */


/*-------------------------------------------------------------------------
 * Function:      H5Pget_fapl_mpiposix
 *
 * Purpose: If the file access property list is set to the H5FD_MPIPOSIX
 *          driver then this function returns a duplicate of the MPI
 *          communicator through the comm pointer. It is the responsibility
 *          of the application to free the returned communicator.
 *
 * Return:  Success:    Non-negative with the communicator and
 *                      information returned through the COMM 
 *                      argument if non-null.  Since it is a duplicate
 *                      of the stored object, future modifications to
 *                      the access property list do not affect it and
 *                      it is the responsibility of the application to
 *                      free it.
 *
 *          Failure:    Negative
 *
 * Programmer:    Quincey Koziol
 *          Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Return duplicate of the stored communicator.
 *
 *              Bill Wendling, 2003-05-01
 *              Return the USE_GPFS flag.
 *
 *-------------------------------------------------------------------------
 */
herr_t
H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm/*out*/, hbool_t *use_gpfs/*out*/)
{
    H5FD_mpiposix_fapl_t *driver_info;          /* Driver info stored on the list */
    H5P_genplist_t       *fapl_plist;           /* Property list behind fapl_id */
    int                   mpi_code;             /* MPI return code */
    herr_t                ret_value = SUCCEED;  /* Return value */

    FUNC_ENTER_API(H5Pget_fapl_mpiposix, FAIL);
    H5TRACE3("e","ixx",fapl_id,comm,use_gpfs);

    /* The list must be a file access list that uses this driver */
    fapl_plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS);
    if (NULL == fapl_plist) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
    }
    if (H5P_get_driver(fapl_plist) != H5FD_MPIPOSIX) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver");
    }
    driver_info = H5P_get_driver_info(fapl_plist);
    if (NULL == driver_info) {
        HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info");
    }

    /* Give the caller its own duplicate of the stored communicator */
    if (comm != NULL) {
        mpi_code = MPI_Comm_dup(driver_info->comm, comm);
        if (mpi_code != MPI_SUCCESS) {
            HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code);
        }
    }

    /* Report the GPFS flag only when the caller asked for it */
    if (use_gpfs != NULL) {
        *use_gpfs = driver_info->use_gpfs;
    }

done:
    FUNC_LEAVE_API(ret_value);
} /* end H5Pget_fapl_mpiposix() */
#endif /* H5_WANT_H5_V1_4_COMPAT */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_communicator
 *
 * Purpose: Returns the MPI communicator for the file.
 *
 * Return:  Success:    The communicator
 *
 *          Failure:    MPI_COMM_NULL
 *
 * Programmer:    Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
MPI_Comm
H5FD_mpiposix_communicator(H5FD_t *_file)
{
    H5FD_mpiposix_t *mpiposix_file = (H5FD_mpiposix_t *)_file;  /* Driver-specific view of _file */
    MPI_Comm         ret_value;                                 /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_communicator, MPI_COMM_NULL);

    /* The handle must exist and belong to this driver */
    assert(mpiposix_file != NULL);
    assert(mpiposix_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Return the communicator stored in the file struct (not a duplicate) */
    ret_value = mpiposix_file->comm;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_communicator() */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_mpi_rank
 *
 * Purpose: Returns the MPI rank for a process
 *
 * Return:  Success: non-negative
 *          Failure: negative
 *
 * Programmer:    Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
int
H5FD_mpiposix_mpi_rank(H5FD_t *_file)
{
    H5FD_mpiposix_t *mpiposix_file = (H5FD_mpiposix_t *)_file;  /* Driver-specific view of _file */
    int              ret_value;                                 /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_rank, FAIL);

    /* The handle must exist and belong to this driver */
    assert(mpiposix_file != NULL);
    assert(mpiposix_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Return the rank stored in the file struct */
    ret_value = mpiposix_file->mpi_rank;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_mpi_rank() */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_mpi_size
 *
 * Purpose: Returns the number of MPI processes recorded for the file,
 *          i.e. the `mpi_size' field of its H5FD_mpiposix_t struct.
 *
 * Parameters:    _file - File handle.  Must be non-NULL and belong to
 *                        the MPIPOSIX driver (both are asserted).
 *
 * Return:  Success: non-negative
 *          Failure: negative
 *
 * Programmer:    Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
int
H5FD_mpiposix_mpi_size(H5FD_t *_file)
{
    H5FD_mpiposix_t *file = (H5FD_mpiposix_t*)_file;
    int ret_value;      /* Return value */

    /* Enter under this function's own name, as every other routine in this
     * file does, so that anything the macro derives from the name refers to
     * H5FD_mpiposix_mpi_size and not to a sibling function.
     */
    FUNC_ENTER_NOAPI(H5FD_mpiposix_mpi_size, FAIL);

    assert(file);
    assert(H5FD_MPIPOSIX==file->pub.driver_id);

    /* Set return value */
    ret_value=file->mpi_size;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_mpi_size() */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_fapl_get
 *
 * Purpose: Returns a file access property list which could be used to
 *          create another file the same as this one.
 *
 * Parameters:    _file - File handle.  Must be non-NULL and belong to
 *                        the MPIPOSIX driver (both are asserted).
 *
 * Return:  Success:    Ptr to a newly allocated H5FD_mpiposix_fapl_t
 *                      holding a duplicate of the file's communicator
 *                      and a copy of its use_gpfs flag.
 *
 *          Failure:    NULL.  Nothing allocated here is left behind.
 *
 * Programmer:    Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Duplicate the communicator object so that the new
 *          property list is insulated from the old one.
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD_mpiposix_fapl_get(H5FD_t *_file)
{
    H5FD_mpiposix_t     *file = (H5FD_mpiposix_t*)_file;
    H5FD_mpiposix_fapl_t *fa = NULL;
    int           mpi_code;   /* MPI return code */
    void        *ret_value = NULL;      /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_fapl_get, NULL);

    assert(file);
    assert(H5FD_MPIPOSIX==file->pub.driver_id);

    if (NULL==(fa=H5MM_calloc(sizeof(H5FD_mpiposix_fapl_t))))
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");

    /* Duplicate the communicator. */
    if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(file->comm, &fa->comm)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code);

    fa->use_gpfs = file->use_gpfs;

    /* Set return value */
    ret_value=fa;

done:
    /* On failure, release the struct allocated above so that a failed
     * MPI_Comm_dup() does not leak it (same cleanup as the fapl copy
     * callback performs).
     */
    if (NULL == ret_value) {
        if (fa)
            H5MM_xfree(fa);
    }

    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_fapl_get() */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_fapl_copy
 *
 * Purpose: Copies the mpiposix-specific file access properties.
 *
 * Return:  Success:    Ptr to a new property list
 *
 *          Failure:    NULL
 *
 * Programmer:    Albert Cheng
 *              Apr 24, 2003
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static void *
H5FD_mpiposix_fapl_copy(const void *_old_fa)
{
    const H5FD_mpiposix_fapl_t *src_fa = (const H5FD_mpiposix_fapl_t *)_old_fa;  /* Properties to copy */
    H5FD_mpiposix_fapl_t       *dup_fa = NULL;      /* The copy being built */
    int                         mpi_code;           /* MPI return code */
    void                       *ret_value = NULL;   /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_fapl_copy, NULL);

    dup_fa = H5MM_malloc(sizeof(H5FD_mpiposix_fapl_t));
    if (NULL == dup_fa) {
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
    }

    /* Start from a byte-for-byte copy of the source properties */
    HDmemcpy(dup_fa, src_fa, sizeof(H5FD_mpiposix_fapl_t));

    /* Replace the copied communicator handle with a duplicate of its own */
    mpi_code = MPI_Comm_dup(src_fa->comm, &dup_fa->comm);
    if (mpi_code != MPI_SUCCESS) {
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code);
    }

    /* Already covered by the memcpy above; kept explicit */
    dup_fa->use_gpfs = src_fa->use_gpfs;
    ret_value = dup_fa;

done:
    /* Release the partially built copy when anything above failed */
    if (NULL == ret_value && dup_fa != NULL) {
        H5MM_xfree(dup_fa);
    }

    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_fapl_copy() */


/*-------------------------------------------------------------------------
 * Function:      H5FD_mpiposix_fapl_free
 *
 * Purpose: Frees the mpiposix-specific file access properties.
 *
 * Return:  Success:    0
 *
 *          Failure:    -1
 *
 * Programmer:    Albert Cheng
 *              Apr 24, 2003
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_fapl_free(void *_fa)
{
    herr_t        ret_value = SUCCEED;
    H5FD_mpiposix_fapl_t      *fa = (H5FD_mpiposix_fapl_t*)_fa;

    FUNC_ENTER_NOAPI(H5FD_mpiposix_fapl_free, FAIL);
    assert(fa);

    /* Free the internal communicator */
    assert(MPI_COMM_NULL!=fa->comm);
    MPI_Comm_free(&fa->comm);
    H5MM_xfree(fa);

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_fapl_free() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_open
 *
 * Purpose:     Opens a file with name NAME.  The FLAGS are a bit field with
 *          purpose similar to the second argument of open(2) and which
 *          are defined in H5Fpublic.h. The file access property list
 *          FAPL_ID contains the driver properties and MAXADDR
 *          is the largest address which this file will be expected to
 *          access.  This is collective.
 *
 * Return:      Success:        A new file pointer.
 *              Failure:        NULL
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *          Albert Cheng, 2003-04-24
 *          Duplicate the communicator so that file is insulated from the
 *          old one.
 *
 *-------------------------------------------------------------------------
 */
static H5FD_t *
H5FD_mpiposix_open(const char *name, unsigned flags, hid_t fapl_id,
             haddr_t maxaddr)
{
    H5FD_mpiposix_t           *file=NULL;     /* New MPIPOSIX file struct */
    int                         o_flags;        /* Flags for file open call */
    int                         fd=(-1);        /* File handle for file opened */
    int                       mpi_rank;       /* MPI rank of this process */
    int                       mpi_size;       /* Total number of MPI processes */
    int                       mpi_code;   /* mpi return code */
    const H5FD_mpiposix_fapl_t      *fa=NULL;       /* MPIPOSIX file access property list information */
    H5FD_mpiposix_fapl_t      _fa;            /* Private copy of default file access property list information */
    H5P_genplist_t              *plist;         /* Property list pointer */
    h5_stat_t                   sb;             /* Portable 'stat' struct */
#ifdef WIN32
    HFILE filehandle;
    struct _BY_HANDLE_FILE_INFORMATION fileinfo;
    int results;   
#endif
    H5FD_t                     *ret_value=NULL; /* Return value */
    MPI_Comm                    comm_dup=MPI_COMM_NULL;

    FUNC_ENTER_NOAPI(H5FD_mpiposix_open, NULL);

    /* Check arguments */
    if (!name || !*name)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name");
    if (0==maxaddr || HADDR_UNDEF==maxaddr)
        HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr");
    if (ADDR_OVERFLOW(maxaddr))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr");

    /* Obtain a pointer to mpiposix-specific file access properties */
    if(NULL == (plist = H5P_object_verify(fapl_id,H5P_FILE_ACCESS)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list");
    if (H5P_FILE_ACCESS_DEFAULT==fapl_id || H5FD_MPIPOSIX!=H5P_get_driver(plist)) {
      _fa.comm = MPI_COMM_SELF; /*default*/
        _fa.use_gpfs = FALSE;
      fa = &_fa;
    } /* end if */
    else {
      fa = H5P_get_driver_info(plist);
      assert(fa);
    } /* end else */

    /* Duplicate the communicator for use by this file. */
    if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(fa->comm, &comm_dup)))
      HMPI_GOTO_ERROR(NULL, "MPI_Comm_dup failed", mpi_code);

    /* Get the MPI rank of this process and the total number of processes */
    if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (comm_dup, &mpi_rank)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code);
    if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size)))
        HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code);

    /* Build the open flags */
    o_flags = (H5F_ACC_RDWR & flags) ? O_RDWR : O_RDONLY;

    /* Only set the creation flag(s) for process 0 */
    if(mpi_rank==0) {
        if (H5F_ACC_TRUNC & flags)
            o_flags |= O_TRUNC;
        if (H5F_ACC_CREAT & flags)
            o_flags |= O_CREAT;
        if (H5F_ACC_EXCL & flags)
            o_flags |= O_EXCL;
    } /* end if */

    /* Process 0 opens (or creates) the file while the rest of the
     * processes wait.  Then process 0 signals the other processes and they
     * open (never create) the file and all processes proceed.
     */
    /* Process 0 opens (or creates) file and broadcasts result to other processes */
    if(mpi_rank==0) {
        /* Open the file */
        fd=HDopen(name, o_flags, 0666);
    } /* end if */

    /* Broadcast the results of the open() from process 0 */
    /* This is necessary because of the "tentative open" code in H5F_open()
     * where the file is attempted to be opened with different flags from the
     * user's, in order to check for the file's existence, etc.  Here, process 0
     * gets different flags from the other processes (since it is in charge of
     * creating the file, if necessary) and can fail in situations where the
     * other process's file opens would succeed, so allow the other processes
     * to check for that situation and bail out now also. - QAK
     */
    if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&fd, sizeof(int), MPI_BYTE, 0, comm_dup)))
        HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code);

    /* If the file open on process 0 failed, bail out on all processes now */
    if(fd<0)
        HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file");

    /* Other processes (non 0) wait for broadcast result from process 0 and then open file */
    if(mpi_rank!=0) {
        /* Open the file */
        if ((fd=HDopen(name, o_flags, 0666))<0)
            HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file");
    } /* end if */

    /* Process 0 fstat()s the file and broadcasts the results to the other processes */
    if(mpi_rank==0) {
        /* Get the stat information */
        if (HDfstat(fd, &sb)<0)
            HGOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file");
    } /* end if */

    /* Broadcast the results of the fstat() from process 0 */
    if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&sb, sizeof(h5_stat_t), MPI_BYTE, 0, comm_dup)))
        HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code);

#ifdef H5_HAVE_GPFS
    if (fa->use_gpfs) {
        /*
         * Free all byte range tokens. This is a good thing to do if raw data is aligned on 256kB boundaries (a GPFS page is
         * 256kB). Care should be taken that there aren't too many sub-page writes, or the mmfsd may become overwhelmed.  This
         * should probably eventually be passed down here as a property. The gpfs_fcntl() will most likely fail if `fd' isn't
         * on a GPFS file system. */
        struct {
            gpfsFcntlHeader_t   hdr;
            gpfsFreeRange_t     fr;
        } hint;
        memset(&hint, 0, sizeof hint);
        hint.hdr.totalLength = sizeof hint;
        hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
        hint.fr.structLen = sizeof hint.fr;
        hint.fr.structType = GPFS_FREE_RANGE;
        hint.fr.start = 0;
        hint.fr.length = 0;
        
        if (gpfs_fcntl(fd, &hint)<0)
            HGOTO_ERROR(H5E_FILE, H5E_FCNTL, NULL, "failed to send hints to GPFS");
    }
#endif  /* H5_HAVE_GPFS */

    /* Build the file struct and initialize it */
    if (NULL==(file=H5MM_calloc(sizeof(H5FD_mpiposix_t))))
        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");

#ifdef REPORT_IO
    HDfprintf(stderr, "open:  rank=%d name=%s file=0x%08lx\n", mpi_rank, name, (unsigned long)file);
#endif

    /* Set the general file information */
    file->fd = fd;
    file->eof = sb.st_size;
    file->blksize = sb.st_blksize;

    /* Set this field in the H5FD_mpiposix_t struct for later use */
    file->use_gpfs = fa->use_gpfs;

    /* Set the MPI information */
    file->comm = comm_dup;
    file->mpi_rank = mpi_rank;
    file->mpi_size = mpi_size;

    /* Reset the last file I/O operation */
    file->pos = HADDR_UNDEF;
    file->op = OP_UNKNOWN;

    /* Set the information for the file's device and inode */
#ifdef WIN32
    filehandle = _get_osfhandle(fd);
    results = GetFileInformationByHandle((HANDLE)filehandle, &fileinfo);
    file->fileindexhi = fileinfo.nFileIndexHigh;
    file->fileindexlo = fileinfo.nFileIndexLow;
#else
    file->device = sb.st_dev;
    file->inode = sb.st_ino;
#endif

    /* Indicate success */
    ret_value=(H5FD_t *)file;

done:
    /* Error cleanup */
    if(ret_value==NULL) {
        /* Close the file if it was left open */
        if(fd!=(-1))
            HDclose(fd);
      if (MPI_COMM_NULL != comm_dup)
          MPI_Comm_free(&comm_dup);
    } /* end if */

    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_open() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_close
 *
 * Purpose:     Closes the POSIX file descriptor held by an MPI-POSIX file,
 *              then frees the communicator stored in the file struct and
 *              releases the struct itself.
 *
 *              If closing the descriptor fails the function returns right
 *              away; in that case neither the communicator nor the struct
 *              is released.
 *
 * Return:      Success:    Non-negative
 *              Failure:    Negative
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *              Albert Cheng, 2003-04-24
 *              Free the communicator stored.
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_close(H5FD_t *_file)
{
    H5FD_mpiposix_t *mp_file = (H5FD_mpiposix_t *)_file;    /* Driver-specific view of the file */
    herr_t           ret_value = SUCCEED;                   /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_close, FAIL);

    assert(mp_file);
    assert(mp_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Release the underlying descriptor first; give up if that fails */
    if (0 > HDclose(mp_file->fd))
        HGOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file");

    /* The descriptor is gone: drop the communicator, then the struct */
    MPI_Comm_free(&(mp_file->comm));
    H5MM_xfree(mp_file);

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_close() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_cmp
 *
 * Purpose:     Orders two files opened with this driver.  On WIN32 the
 *              comparison key is the file index (high word first, then
 *              low word); elsewhere it is the device followed by the
 *              inode.  When dev_t is not a scalar type the devices are
 *              compared bytewise with memcmp.
 *
 * Return:      Success:    -1, 0 or 1, in the manner of strcmp()
 *              Failure:    never fails (arguments were checked by the
 *                          caller).
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static int
H5FD_mpiposix_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
{
    const H5FD_mpiposix_t *lhs = (const H5FD_mpiposix_t *)_f1;
    const H5FD_mpiposix_t *rhs = (const H5FD_mpiposix_t *)_f2;
    int                    ret_value = 0;   /* Files are equal unless a key differs */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_cmp, H5FD_VFD_DEFAULT);

#ifdef WIN32
    /* Most significant half of the file index decides first */
    if (lhs->fileindexhi != rhs->fileindexhi)
        HGOTO_DONE((lhs->fileindexhi < rhs->fileindexhi) ? -1 : 1);

    if (lhs->fileindexlo != rhs->fileindexlo)
        HGOTO_DONE((lhs->fileindexlo < rhs->fileindexlo) ? -1 : 1);

#else
#ifdef H5_DEV_T_IS_SCALAR
    if (lhs->device != rhs->device)
        HGOTO_DONE((lhs->device < rhs->device) ? -1 : 1);
#else /* H5_DEV_T_IS_SCALAR */
    /* dev_t is opaque here, so fall back to a bytewise comparison.  Only
     * the sign of the result is used; its magnitude is irrelevant.
     */
    {
        int dev_order = HDmemcmp(&(lhs->device), &(rhs->device), sizeof(dev_t));

        if (dev_order < 0) HGOTO_DONE(-1);
        if (dev_order > 0) HGOTO_DONE(1);
    }
#endif /* H5_DEV_T_IS_SCALAR */

    /* Same device: the inode breaks the tie */
    if (lhs->inode != rhs->inode)
        HGOTO_DONE((lhs->inode < rhs->inode) ? -1 : 1);
#endif

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_cmp() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_query
 *
 * Purpose:     Reports the VFL feature flags (listed in H5FDpublic.h)
 *              that this driver supports.  The file argument is not
 *              used.  When FLAGS is NULL nothing is stored.
 *
 * Return:      Success:    non-negative
 *              Failure:    negative
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_query(const H5FD_t UNUSED *_file, unsigned long *flags /* out */)
{
    herr_t ret_value = SUCCEED;     /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_query, FAIL);

    if (NULL != flags) {
        unsigned long supported = 0;    /* Feature set accumulated below */

        /* Metadata allocations may be aggregated */
        supported |= H5FD_FEAT_AGGREGATE_METADATA;

        /* Metadata may be accumulated for faster writes.  Only the
         * write-side flag is set: writes are guaranteed to be collective
         * but reads may not be, and letting the accumulator be written
         * out during a read would hang the application.
         */
        supported |= H5FD_FEAT_ACCUMULATE_METADATA_WRITE;

        /* "Small" raw data allocations may be aggregated */
        supported |= H5FD_FEAT_AGGREGATE_SMALLDATA;

        *flags = supported;
    } /* end if */

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_query() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_get_eoa
 *
 * Purpose:     Returns the end-of-address (EOA) marker recorded for the
 *              file.  The EOA marker is the first address past the last
 *              byte allocated in the format address space.
 *
 * Return:      Success:    The end-of-address marker.
 *              Failure:    HADDR_UNDEF
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD_mpiposix_get_eoa(H5FD_t *_file)
{
    H5FD_mpiposix_t *mp_file = (H5FD_mpiposix_t *)_file;    /* Driver-specific view of the file */
    haddr_t          ret_value;                             /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eoa, HADDR_UNDEF);

    assert(mp_file);
    assert(mp_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Hand back the stored marker unchanged */
    ret_value = mp_file->eoa;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_get_eoa() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_set_eoa
 *
 * Purpose:     Records ADDR as the file's end-of-address marker.  This
 *              function is called shortly after an existing HDF5 file is
 *              opened in order to tell the driver where the end of the
 *              HDF5 data is located.  The value is stored as given; no
 *              validation or file I/O is performed here.
 *
 * Return:      Success:    non-negative
 *              Failure:    negative
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_set_eoa(H5FD_t *_file, haddr_t addr)
{
    H5FD_mpiposix_t *mp_file = (H5FD_mpiposix_t *)_file;    /* Driver-specific view of the file */
    herr_t           ret_value = SUCCEED;                   /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_set_eoa, FAIL);

    assert(mp_file);
    assert(mp_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Remember the new marker */
    mp_file->eoa = addr;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_set_eoa() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_get_eof
 *
 * Purpose:     Gets the end-of-file marker for the file.  The value
 *              reported is the larger of the driver's stored EOF and the
 *              current EOA.
 *
 *              The MPIPOSIX driver doesn't bother keeping the EOF field
 *              updated since that's a relatively expensive operation.
 *              Fortunately the library only needs the EOF just after the
 *              file is opened in order to determine whether the file is
 *              empty, truncated, or okay.
 *
 * Return:      Success:    The end-of-file marker.
 *              Failure:    HADDR_UNDEF
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static haddr_t
H5FD_mpiposix_get_eof(H5FD_t *_file)
{
    H5FD_mpiposix_t *mp_file = (H5FD_mpiposix_t *)_file;    /* Driver-specific view of the file */
    haddr_t          ret_value;                             /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_get_eof, HADDR_UNDEF);

    assert(mp_file);
    assert(mp_file->pub.driver_id == H5FD_MPIPOSIX);

    /* Start from the stored EOF and raise it to the EOA if that is larger */
    ret_value = mp_file->eof;
    if (mp_file->eoa > ret_value)
        ret_value = mp_file->eoa;

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_get_eof() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_get_handle
 *
 * Purpose:     Returns the file handle of the MPI-POSIX file driver.  On
 *              success *FILE_HANDLE is set to the address of the POSIX
 *              file descriptor stored inside the driver's file struct.
 *              The FAPL argument is not used.
 *
 * Return:      Success:    Non-negative
 *              Failure:    Negative (FILE_HANDLE was NULL)
 *
 * Programmer:  Raymond Lu
 *              Sept. 16, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_get_handle(H5FD_t *_file, hid_t UNUSED fapl, void** file_handle)
{
    H5FD_mpiposix_t *mp_file = (H5FD_mpiposix_t *)_file;    /* Driver-specific view of the file */
    herr_t           ret_value = SUCCEED;                   /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_get_handle, FAIL);

    /* There must be somewhere to store the result */
    if (NULL == file_handle)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid");

    /* Expose the descriptor by address, not by value */
    *file_handle = &(mp_file->fd);

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_get_handle() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_read
 *
 * Purpose:     Reads SIZE bytes from FILE, starting at address ADDR, into
 *              the buffer BUF using POSIX I/O on the file descriptor.
 *              The memory type and DXPL_ID arguments are not consulted
 *              (TYPE is only printed when REPORT_IO is defined).
 *
 *              A seek is issued only when the previous operation on the
 *              file was not a read that ended exactly at ADDR.
 *
 *              Reading past the end of the file returns zeros instead of
 *              failing: once the descriptor reports end-of-file, the
 *              remainder of BUF is zero-filled.
 *
 * Return:      Success:    Non-negative. Result is stored in
 *                          caller-supplied buffer BUF.
 *              Failure:    Negative. Contents of buffer BUF are undefined
 *                          and the cached position/operation are reset.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_read(H5FD_t *_file, H5FD_mem_t UNUSED type, hid_t UNUSED dxpl_id, haddr_t addr, size_t size,
             void *buf/*out*/)
{
    H5FD_mpiposix_t *file = (H5FD_mpiposix_t *)_file;   /* Driver-specific view of the file */
    ssize_t          nbytes;                            /* Number of bytes read each I/O call */
    herr_t           ret_value = SUCCEED;               /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_read, FAIL);

    assert(file);
    assert(file->pub.driver_id == H5FD_MPIPOSIX);
    assert(buf);

    /* Reject undefined addresses and requests that overflow or extend past the EOA */
    if (addr == HADDR_UNDEF)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined");
    if (REGION_OVERFLOW(addr, size))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
    if (file->eoa < addr+size)
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");

#ifdef REPORT_IO
    {
        int commrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
        HDfprintf(stderr, "read:  rank=%d file=0x%08lx type=%d, addr=%a size=%Zu\n",
                commrank, (unsigned long)file, (int)type, addr, size);
    }
#endif

    /* The seek can be skipped only when the last operation was a read
     * that left the descriptor exactly at ADDR.
     */
    if (!(addr == file->pos && OP_READ == file->op)) {
        if (file_seek(file->fd, (file_offset_t)addr, SEEK_SET) < 0)
            HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position");
    } /* end if */

    /*
     * Pull the data in piece by piece, coping with interrupted system
     * calls, short reads and hitting the end of the file.
     */
    while (size > 0) {
        /* Retry for as long as the call is interrupted */
        do {
            nbytes = HDread(file->fd, buf, size);
        } while (nbytes == -1 && errno == EINTR);

        if (nbytes == -1)
            HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");

        if (nbytes == 0) {
            /* End of file reached before the end of the format address
             * space: the rest of the request reads as zeros.  Neither the
             * address nor the buffer advances for the zero-filled part.
             */
            HDmemset(buf, 0, size);
            break;
        } /* end if */

        assert(nbytes>=0);
        assert((size_t)nbytes<=size);

        /* Account for the bytes just delivered */
        buf = (char *)buf + nbytes;
        addr += (haddr_t)nbytes;
        size -= (size_t)nbytes;
    } /* end while */

    /* Remember where the descriptor was left and what was done there */
    file->op = OP_READ;
    file->pos = addr;

done:
    if (ret_value < 0) {
        /* After a failure the descriptor position is unknown */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */

    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_read() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_write
 *
 * Purpose:     Writes SIZE bytes of data to FILE beginning at address ADDR
 *              from buffer BUF using POSIX I/O on the file descriptor.
 *
 *              Raw data (TYPE == H5FD_MEM_DRAW) is written by every
 *              calling process.  For any other memory type only the
 *              process whose rank is H5_PAR_META_WRITE performs the
 *              write; the other processes skip it.  On the success path
 *              every process then takes part in a broadcast of the
 *              return value rooted at H5_PAR_META_WRITE.
 *
 *              If the transfer property list DXPL_ID carries a non-zero
 *              H5AC_BLOCK_BEFORE_META_WRITE_NAME property, all processes
 *              synchronize on a barrier before the metadata write.
 *
 *              When built with H5_HAVE_GPFS and GPFS hints are enabled
 *              for the file, the first write access sends a
 *              multiple-access-range hint to GPFS.
 *
 * Return:      Success:    non-negative
 *              Failure:    negative; the cached position/operation are
 *                          reset.
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *              Quincey Koziol - 2002/07/18
 *              Added "block_before_meta_write" dataset transfer flag, which
 *              is set during writes from a metadata cache flush and indicates
 *              that all the processes must sync up before (one of them)
 *              writing metadata.
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
            size_t size, const void *buf)
{
    H5FD_mpiposix_t     *file = (H5FD_mpiposix_t*)_file;
    int                 mpi_code;   /* MPI return code */
    ssize_t         nbytes;         /* Number of bytes written each I/O call */
    H5P_genplist_t      *plist;         /* Property list pointer */
    herr_t              ret_value=SUCCEED;      /* Return value */

    FUNC_ENTER_NOAPI(H5FD_mpiposix_write, FAIL);

    assert(file);
    assert(H5FD_MPIPOSIX==file->pub.driver_id);
    assert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
    assert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
    assert(buf);

    /* Check for overflow conditions */
    if (HADDR_UNDEF==addr)
        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined");
    if (REGION_OVERFLOW(addr, size))
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");
    if (addr+size>file->eoa)
        HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow");

    /* Obtain the data transfer properties (dxpl_id is asserted above to be
     * a dataset transfer property list, so report it as such on failure)
     */
    if(NULL == (plist = H5I_object(dxpl_id)))
        HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a dataset transfer property list");

    /* Metadata specific actions */
    if(type!=H5FD_MEM_DRAW) {
        unsigned        block_before_meta_write=0;      /* Whether to block before a metadata write */

        /* Check if we need to synchronize all processes before attempting metadata write
         * (Prevents race condition where the process writing the metadata goes ahead
         * and writes the metadata to the file before all the processes have
         * read the data, "transmitting" data from the "future" to the reading
         * process. -QAK )
         *
         * The only time we don't want to block before a metadata write is when
         * we are flushing out a bunch of metadata.  Then, we block before the
         * first write and don't block for further writes in the sequence.
         */
        if(H5P_exist_plist(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME)>0)
            if(H5P_get(plist,H5AC_BLOCK_BEFORE_META_WRITE_NAME,&block_before_meta_write)<0)
                HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get H5AC property");

        if(block_before_meta_write)
            if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
                HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);

        /* Only one process will do the actual write if all procs in comm write same metadata */
        if (file->mpi_rank != H5_PAR_META_WRITE)
            HGOTO_DONE(SUCCEED) /* skip the actual write */
    } /* end if */

#ifdef REPORT_IO
    {
        int commrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &commrank);
        HDfprintf(stderr, "write: rank=%d file=0x%08lx type=%d, addr=%a size=%Zu %s\n",
                commrank, (unsigned long)file, (int)type, addr, size,
                0==file->naccess?"(FIRST ACCESS)":"");
    }
#endif

    /* The access counter is bumped on every write that reaches this point */
    if (0==file->naccess++) {
        /* First write access to this file */
#ifdef H5_HAVE_GPFS
        if (file->use_gpfs) {
            struct {
                gpfsFcntlHeader_t           hdr;
                gpfsMultipleAccessRange_t   mar;
            } hint;
            memset(&hint, 0, sizeof hint);
            hint.hdr.totalLength = sizeof hint;
            hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
            hint.mar.structLen = sizeof hint.mar;
            hint.mar.structType = GPFS_MULTIPLE_ACCESS_RANGE;
            /* Describe a single write range: the part of this request
             * that falls inside the block containing ADDR.
             */
            hint.mar.accRangeCnt = 1;
            hint.mar.accRangeArray[0].blockNumber = addr / file->blksize;
            hint.mar.accRangeArray[0].start = addr % file->blksize;
            hint.mar.accRangeArray[0].length = MIN(file->blksize-hint.mar.accRangeArray[0].start, size);
            hint.mar.accRangeArray[0].isWrite = 1;
            /* This function returns herr_t, so the failure value must be
             * FAIL: NULL would convert to 0 and be read as success.
             */
            if (gpfs_fcntl(file->fd, &hint)<0)
                HGOTO_ERROR(H5E_FILE, H5E_FCNTL, FAIL, "failed to send hints to GPFS");
        }
#endif  /* H5_HAVE_GPFS */
    }

    /* Seek to the correct location, unless the previous operation was a
     * write that left the descriptor exactly at ADDR
     */
    if ((addr!=file->pos || OP_WRITE!=file->op) &&
            file_seek(file->fd, (file_offset_t)addr, SEEK_SET)<0)
        HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to seek to proper position");

    /*
     * Write the data, being careful of interrupted system calls and partial
     * results
     */
    while (size>0) {
        do {
            nbytes = HDwrite(file->fd, buf, size);
        } while (-1==nbytes && EINTR==errno);
        if (-1==nbytes)
            HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed");
        assert(nbytes>0);
        assert((size_t)nbytes<=size);
        size -= nbytes;
        addr += (haddr_t)nbytes;
        buf = (const char*)buf + nbytes;
    } /* end while */

    /* Update current last file I/O information */
    file->pos = addr;
    file->op = OP_WRITE;

done:
    /* Check for error */
    if(ret_value<0) {
        /* Reset last file I/O information */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */
    /* Guard against getting into metadata broadcast in failure cases */
    else {
        /* if only one process writes, need to broadcast the ret_value to other processes */
        /* NOTE(review): a process that failed above skips this broadcast.
         * If that process is the H5_PAR_META_WRITE root while the others
         * arrived here with success, they appear to wait in MPI_Bcast
         * without a matching root call -- confirm how callers handle it.
         */
        if (type!=H5FD_MEM_DRAW) {
            if (MPI_SUCCESS != (mpi_code= MPI_Bcast(&ret_value, sizeof(ret_value), MPI_BYTE, H5_PAR_META_WRITE, file->comm)))
                HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
        } /* end if */
    } /* end else */

    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_write() */


/*-------------------------------------------------------------------------
 * Function:    H5FD_mpiposix_flush
 *
 * Purpose:     Makes sure the file on disk is at least as large as the
 *              current EOA.  This is collective.
 *
 *              Work is done only when the EOA has grown past the value
 *              recorded at the previous flush.  In that case the process
 *              whose rank is H5_PAR_META_WRITE sets the file length to
 *              the EOA, all processes synchronize on a barrier, and each
 *              process updates its recorded EOA/EOF and forgets its
 *              cached file position.  The DXPL_ID and CLOSING arguments
 *              are not used.
 *
 * Return:      Success:    Non-negative
 *              Failure:    Negative
 *
 * Programmer:  Quincey Koziol
 *              Thursday, July 11, 2002
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
static herr_t
H5FD_mpiposix_flush(H5FD_t *_file, hid_t UNUSED dxpl_id, unsigned UNUSED closing)
{
    H5FD_mpiposix_t     *file = (H5FD_mpiposix_t*)_file;
#ifdef WIN32
    HFILE filehandle;   /* Windows file handle */
    LARGE_INTEGER li;   /* 64-bit integer for SetFilePointer() call */
#endif /* WIN32 */
    int                 mpi_code;   /* MPI return code */
    herr_t              ret_value=SUCCEED;

    FUNC_ENTER_NOAPI(H5FD_mpiposix_flush, FAIL);

    assert(file);
    assert(H5FD_MPIPOSIX==file->pub.driver_id);

    /* Extend the file to make sure it's large enough */
    if(file->eoa>file->last_eoa) {
        /* Use the round-robin process to truncate (extend) the file */
        /* NOTE(review): if the resize below fails, this process jumps to
         * `done' without reaching the barrier that the other processes
         * enter -- confirm whether callers can tolerate that.
         */
        if(file->mpi_rank == H5_PAR_META_WRITE) {
#ifdef WIN32
            /* Map the posix file handle to a Windows file handle.  The
             * descriptor lives in the file struct; there is no local `fd'
             * in this function.
             */
            filehandle = _get_osfhandle(file->fd);

            /* Translate 64-bit integers into form Windows wants */
            /* [This algorithm is from the Windows documentation for SetFilePointer()] */
            li.QuadPart = file->eoa;
            SetFilePointer((HANDLE)filehandle,li.LowPart,&li.HighPart,FILE_BEGIN);
            if(SetEndOfFile((HANDLE)filehandle)==0)
                HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly");
#else /* WIN32 */
            if(-1==file_truncate(file->fd, (file_offset_t)file->eoa))
                HGOTO_ERROR(H5E_IO, H5E_SEEKERROR, FAIL, "unable to extend file properly");
#endif /* WIN32 */
        } /* end if */

        /* Don't let any proc return until all have extended the file.
         * (Prevents race condition where some processes go ahead and write
         * more data to the file before all the processes have finished making
         * it the shorter length, potentially truncating the file and dropping
         * the new data written)
         */
        if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(file->comm)))
            HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);

        /* Update the 'last' eoa and eof values */
        file->last_eoa=file->eoa;
        file->eof = file->eoa;

        /* Reset last file I/O information, so the next read or write
         * seeks explicitly
         */
        file->pos = HADDR_UNDEF;
        file->op = OP_UNKNOWN;
    } /* end if */

done:
    FUNC_LEAVE_NOAPI(ret_value);
} /* end H5FD_mpiposix_flush() */

#endif /*H5_HAVE_PARALLEL*/


Generated by  Doxygen 1.6.0   Back to index