Source code for siman.database

#Copyright Aksyonov D.A
from __future__ import division, unicode_literals, absolute_import 

import shelve, sys, datetime, shutil, tempfile, os, json, re, glob
import json

try:
    import pandas as pd
except:
    print('no pandas')

from siman import header
from siman.header import runBash, printlog
from siman.small_functions import makedir
from siman.set_functions import init_default_sets
from siman.functions import invert
# from siman import classes

"""
Module contains utilities for project database management
and working with external databases

TODO:
1) in write_database() make update of history file more clever
"""
# Keys under which the project-level objects are stored in the shelve database file.
calc_key       = 'calc'        # calculations dict; referenced only by the disabled legacy writers in this module
conv_key       = 'conv'        # convergence lists  (header.conv)
varset_key     = 'varset'      # input parameter sets (header.varset)
history_key    = 'history'     # project history list (header.history)
struct_des_key = 'struct_des'  # structure descriptions (header.struct_des)


def read_database(scratch = False, init_sets = 0):
    """
    Load the project-level dictionaries from the shelve file 'calc.gdbm3'
    into the *header* module.

    INPUT:
        scratch - not used
        init_sets - forwarded to init_default_sets() to (re)initialise the default sets

    RETURN:
        conv   - header.conv, convergence lists
        varset - header.varset, parameter sets of the project
        size   - int, sys.getsizeof() of the (already closed) shelf object; not used now

    SIDE EFFECTS:
        header.conv, header.varset and header.history are always (re)assigned;
        header.struct_des is assigned only if it is present in the file.
    """

    shelf_path = 'calc.gdbm3'

    launch_stamp = str( datetime.datetime.today() )
    printlog("\nLaunch at " + launch_stamp + '\n')

    # (database key, attribute of header, message logged when the key is absent)
    dict_entries = (
        (conv_key,   'conv',   "There is no dictionary of convergence lists. I create new"),
        (varset_key, 'varset', "There is no dictionary of inputsets. I create new"),
    )

    with shelve.open(shelf_path, protocol = 3) as shelf:

        # Dictionaries that fall back to an empty dict when missing
        for db_key, attr_name, missing_msg in dict_entries:
            try:
                loaded = shelf[db_key]
            except KeyError:
                printlog(missing_msg)
                loaded = {}
            setattr(header, attr_name, loaded)

        # History falls back to a fresh one-record list
        try:
            header.history = shelf[history_key]
        except KeyError:
            header.history = ['Project started on ' + str( datetime.date.today() )]
            printlog("There is still no history in database. The list is in header module ")

        # struct_des is left as defined in the header module when missing
        try:
            header.struct_des = shelf[struct_des_key]
        except KeyError:
            printlog("There is no struct_des in database. The dict is in header module ")

    init_default_sets(init_sets)

    shelf_size = sys.getsizeof(shelf)
    return header.conv, header.varset, shelf_size


def write_database(calc = None, conv = None, varset = None, size_on_start = None):
    """
    Write the project-level dictionaries and the calculation objects to disk.

    What is written:
        - header.conv, header.varset, header.history and header.struct_des
          to the shelve file 'calc.gdbm3';
        - every entry of header.db to the shelve file header.calc_database
          (keys are converted with str());
        - header.history to the text file 'history', one record per line.

    Before writing, 'calc.gdbm3' is copied to the reserve copy 'calc_copy.gdbm3'.
    The copy is skipped (and an error is logged) if the actual file is more than
    200000 bytes smaller than the existing reserve copy.

    INPUT:
        calc, conv, varset, size_on_start - not used now, kept for backward
            compatibility; the data are always taken from the header module

    RETURN:
        None
    """
    import dbm

    databasefile3 = 'calc.gdbm3'
    reserve_copy  = 'calc_copy.gdbm3'
    size = os.path.getsize

    if os.path.isfile(databasefile3):
        if os.path.isfile(reserve_copy) and size(databasefile3) < size(reserve_copy) - 200000:
            print(size(databasefile3), size(reserve_copy))
            printlog('Error! actual database file is smaller than reserve copy, something is wrong! Check')
        else:
            # Also creates the reserve copy when it does not exist yet;
            # previously the copy was refreshed only if it was already there.
            shutil.copyfile(databasefile3, reserve_copy)

    # 'c' flag: open for read/write, create the file if needed.
    # The context manager guarantees the shelf is closed even if a write fails.
    with shelve.Shelf(dbm.open(databasefile3, 'c'), protocol = 3) as d:
        d[conv_key]       = header.conv
        d[varset_key]     = header.varset
        d[history_key]    = header.history
        d[struct_des_key] = header.struct_des

    printlog('Opening ', header.calc_database, 'for writing')

    with shelve.Shelf(dbm.open(header.calc_database, 'c'), protocol = 3) as d:
        for key in header.db:
            printlog('saving key:', key, imp = '')
            d[str(key)] = header.db[key]

    if header.reorganize: # please run me from time to time to reduce the size of the database file
        with dbm.open(header.calc_database, 'w') as d:
            d.reorganize()

    printlog("\nEnd of work at "+str(datetime.datetime.now())+'\n')

    # Best effort: the log may be absent or already closed.
    try:
        header.log.close()
    except Exception:
        pass

    # Update history file
    with open('history', 'w') as his:
        his.writelines(record + "\n" for record in header.history)

    print("\nDatabase has been successfully updated\n")

    return






def get_from_database(x1, x2, mat, inquiry_keys = None, silent = None, ssh_object = None):
    """
    Find and load two calculations (for concentrations x1 and x2) of material
    *mat* from the storage located at '/home/Data/CEStorage/'.

    The storage index 'hash_dict.json' maps descriptive keys to file paths
    relative to the storage root. For each of x1 and x2 the keys containing
    x, mat and every string from inquiry_keys are collected; the shortest
    key is chosen.

    INPUT:
        x1, x2 (str) - substrings identifying the two entries
        mat (str) - substring identifying the material
        inquiry_keys (list) - list of keys that should exist in filenames both for x1 and x2
        silent - if true, the lists of candidates are not printed
        ssh_object (SSHTools) - ssh object based on paramiko with access details;
            its get(remote, local) method is used to download files into the
            system temporary folder. If it is not provided, the storage is
            read directly from the local filesystem (previously this case
            failed with UnboundLocalError).

    RETURN:
        (cl1, cl2) - deserialized CalculationVasp objects,
        or (None, None) if nothing was found for x1 or x2
    """
    from siman.classes import CalculationVasp

    def check(key, required):
        return all(k in key for k in required)

    path2database = '/home/Data/CEStorage/'
    hash_dict_file = 'hash_dict.json'
    cluster_path2hash = os.path.join(path2database, hash_dict_file)

    inquiry_keys = [] if inquiry_keys is None else list(inquiry_keys)

    tempdir = tempfile.gettempdir()

    if ssh_object:
        local_path2hash = os.path.join(tempdir, hash_dict_file)
        ssh_object.get(cluster_path2hash, local_path2hash)
    else:
        # No ssh access given: assume the storage is available locally
        local_path2hash = cluster_path2hash

    with open(local_path2hash, 'r') as fp:
        hash_dict = json.load(fp)

    # sorted() is stable, so keys of equal length keep their order in the index
    x1s = sorted((key for key in hash_dict if check(key, inquiry_keys+[x1, mat])), key = len)
    x2s = sorted((key for key in hash_dict if check(key, inquiry_keys+[x2, mat])), key = len)

    if not silent:
        for xi, xis in (x1, x1s), (x2, x2s):
            print('\nFiles for',xi,':')
            for i, f in enumerate(xis):
                print(i+1,f)

    if len(x1s) == 0 or len(x2s) == 0:
        print('No information in database for this inquire:', x1, x2, mat, str(inquiry_keys) )
        return None, None

    key1 = x1s[0]
    key2 = x2s[0]

    if not silent:
        print('\nI choose first entries for both concentrations:',key1, 'and', key2,'\n')

    def fetch(key):
        """Return a local path to the serialized calculation, downloading it if needed."""
        remote = os.path.join(path2database, hash_dict[key])
        if not ssh_object:
            return remote
        local = os.path.join(tempdir, hash_dict[key])
        makedir(local)
        ssh_object.get(remote, local)
        return local

    loc1 = fetch(key1)
    loc2 = fetch(key2)

    cl1 = CalculationVasp().deserialize(loc1)
    cl2 = CalculationVasp().deserialize(loc2)

    return cl1, cl2




def push_figure_to_archive(local_figure_path, caption, figlabel = None, autocompl = True ):
    """
    Copy a figure to header.path_to_images and register it in the file
    'auto_fig.tex' located in header.project_conf.path_to_paper.

    A LaTeX figure environment is appended to 'auto_fig.tex' only if exactly
    the same text is not yet present in that file.

    INPUT:
        local_figure_path (str) - path to the figure file
        caption (str) - caption of the figure
        figlabel (str) - label used as 'fig:<figlabel>'; by default it is derived
            from the file name without extension (a second dotted suffix, if any, is also removed)
        autocompl (bool) - if True, ' for <figlabel>' is appended to the caption

    RETURN:
        None
    """
    shutil.copy(local_figure_path, header.path_to_images)
    # printlog is the logger imported by this module (print_and_log is not imported here)
    printlog('push_figure_to_archive():', local_figure_path, 'copied to', header.path_to_images, imp = 'y')

    # splitext keeps the whole name for files without an extension
    name_without_ext = os.path.splitext(os.path.basename(local_figure_path))[0]
    figfile = '{{'+name_without_ext+'}}'

    if not figlabel:
        # 'a.b' -> 'a'; a name without dots is used as is (was an empty label before)
        figlabel = name_without_ext.rsplit('.', 1)[0]

    if autocompl:
        caption += ' for '+figlabel

    # Backslashes are escaped explicitly; the resulting text is the same as before
    tex_text = (
        "\\begin{{figure}} \n\\includegraphics[width=\\columnwidth]{{{:s}}}\n"
        "\\caption{{\\label{{fig:{:s}}} {:s} }}\n"
        "\\end{{figure}}\n").format(figfile, figlabel, caption)

    # 'a+' creates the file if needed; writes always go to the end of the file
    with open(header.project_conf.path_to_paper+'/auto_fig.tex', 'a+', newline = '') as f:
        f.seek(0)
        existing = f.read()
        if tex_text not in existing:
            f.write(tex_text)
    return





def read_cvs_database(columns, path = r'database/literature.csv'):
    """
    Read a csv file with experimental results.

    INPUT:
        columns (list of str) - columns to read; must include 'is', which becomes the index
        path (str) - path to the csv file

    RETURN:
        pandas DataFrame indexed by 'is'. The row with label 0 (first row after
        the header; presumably units/description - confirm against the file) and
        all rows with missing values in the chosen columns are removed.
        Columns that can be fully converted to numbers are converted, the
        others are left unchanged.
    """

    def to_numeric_or_keep(column):
        # Same result as the deprecated pd.to_numeric(..., errors='ignore')
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    dfs = pd.read_csv(path)[columns]
    dfs = dfs.drop(0).dropna().set_index('is')
    dfs = dfs.apply(to_numeric_or_keep)
    return dfs




def _gz_suffix(path):
    """Return '.gz' if *path* contains '.gz', otherwise an empty string."""
    return '.gz' if '.gz' in path else ''


def _archive_method_label(cl):
    """Build the method part of an archive file name; return (label, u_ramping_flag)."""
    potcar1_m = cl.potcar_lines[0][0]
    # Take the part before the first underscore; works for any number of suffixes
    pot = potcar1_m.split('_')[0]

    xc = cl.xc_inc
    if '-' in xc:
        xc = cl.xc_pot

    if xc == 'PE':
        func = 'PBE'
    elif xc == 'CA':
        func = 'LDA'
    elif xc == 'PS':
        func = 'PBEsol'
    else:
        print('uknown xc type:', xc)
        sys.exit()

    if cl.set.spin_polarized:
        func = 'U'+func #unrestricted

    u_ramping_flag = False
    if hasattr(cl.set, 'u_ramping_nstep') and cl.set.u_ramping_nstep:
        func += '-UR'
        u_ramping_flag = True
    elif cl.set.dftu:
        func += '-U'
    else:
        func += '-'

    func += pot.lower()
    func += str(round(cl.set.ecut))

    return func, u_ramping_flag


def add_to_archive_database(cl, subgroup):
    """
    Copy the results of a calculation to the archive database
    located at header.PATH2DATABASE.

    Files are written to <PATH2DATABASE>/<sub_folder>/<subgroup>/, where
    sub_folder is the first component of cl.path['output']. The common file
    name is 'x<conc>_<method>_<it>_<set>_<version>', built from the alkali
    content x, the method label and cl.id. The following is saved:
        - output file (and intermediate outputs for U-ramping) as *.out
        - final structure as xyz and cif
        - serialized calculation, CHGCAR and, depending on subgroup,
          DOSCAR ('dos', 'DOS'), Bader ACF ('BAD'), vasprun.xml ('ph', 'PH') in bin/
        - INCAR files and k-points dictionary in dat/

    Nothing is done if '4' is not in cl.state after cl.read_results().
    The process is terminated with sys.exit() for an unknown xc type.

    INPUT:
        cl - Calculation which should be added to database;
            cl.base_formula in 'Na2FePO4F' format is required
        subgroup (str) - subgroup folder

    RETURN:
        None

    Requires pymatgen.
    """

    from pymatgen.core.composition import Composition
    from pymatgen.io.cif import CifWriter
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

    join = os.path.join
    basename = os.path.basename
    dirname  = os.path.dirname

    dbpath = header.PATH2DATABASE
    it = cl.id[0]

    sub_folder = cl.path['output'].split('/')[0] # usually basic chemical formula

    print( 'Processing ', cl.id)
    cl.read_results()

    if '4' not in cl.state:
        return
    st = cl.end

    # Determine x
    # universal method, supports several alkali elements
    # requires cl.base_formula in 'Na2FePO4F' format
    # requires pymatgen, would be nice to remove dependency
    cmb = Composition(cl.base_formula)
    cm  = st.get_pm_composition()

    rc  = cl.end.get_reduced_composition().as_dict() # reduced composition dict
    rcb = cmb.reduced_composition.as_dict()

    alk = list(set(st.get_specific_elements(header.ALKALI_ION_ELEMENTS))) # list of unique alkali elements
    tra = list(set(st.get_specific_elements(header.TRANSITION_ELEMENTS)))
    el_for_norm = tra[0] # first element used for normalization

    nnb = rcb[el_for_norm] # number of norm elements in base
    nn  = rc[el_for_norm]  # number of norm elements in interesting structure
    mul = nn/nnb           # multiplier that guarantees normalization

    nab = sum([rcb[invert(z)] for z in header.ALKALI_ION_ELEMENTS if invert(z) in rcb])
    na  = sum([rc[el] for el in alk])
    x = na / mul / nab

    formula = (cm.reduced_composition/mul).formula
    cl.formula = formula
    print('Material detected:', formula, 'sub_folder:', sub_folder)

    sfolder = join(dbpath, sub_folder)

    # makedir is called with a dummy file name, as everywhere in this module,
    # to create the folder itself
    makedir( join(sfolder, 'dummy') )

    # x below 1 is stored as per cent, otherwise as a rounded integer
    if x < 1:
        x = int(round(100*x, 0))
    else:
        x = int(round(x, 0))

    print('Concentration x:', x)

    name = ['x'+str(x)]

    sfolder = join(sfolder, subgroup)
    makedir( join(sfolder, 'dummy') )

    cl.set.update()

    func, u_ramping_flag = _archive_method_label(cl)
    name.append(func)
    name.extend([it.replace('.', '_'), cl.id[1], str(cl.id[2])])

    name_str = '_'.join(name)

    # 1. Output files
    shutil.copyfile(cl.path["output"], join(sfolder, name_str+'.out') )

    if u_ramping_flag:
        print(cl.associated_outcars)
        for i, u_outcar in enumerate(cl.associated_outcars[:-1]): # except the last one, which was copied above
            u = u_outcar.split('.')[1]
            path_to_outcar = join( dirname(cl.path["output"]), u_outcar )

            cl.read_results(load = 'o', choose_outcar = i+1, only_load  = 1)

            shutil.copyfile(path_to_outcar, join(sfolder, name_str+'_'+u+'.out') )

    # 2. Structure and serialized calculation
    cl.end.write_xyz(path = sfolder, filename = name_str)

    # Make sure bin/ exists for everything written there below
    # (it used to be created only in the CHGCAR branch)
    makedir( join(sfolder, 'bin', 'dummy') )
    cl.serialize(join(sfolder, 'bin', name_str))

    st_mp = cl.end.convert2pymatgen()
    sg_before = st_mp.get_space_group_info()
    symprec = 0.1
    sf = SpacegroupAnalyzer(st_mp, symprec = symprec)

    st_mp_prim = sf.find_primitive()

    if st_mp_prim:
        sg_after = st_mp_prim.get_space_group_info()

        if sg_before[0] != sg_after[0]:
            printlog('Attention! the space group was changed after primitive cell searching', sg_before, sg_after)
            printlog('I will save supercell in cif and reduce symprec to 0.01')
            st_mp_prim = st_mp
            symprec = 0.01

    if st_mp_prim:
        cif = CifWriter(st_mp_prim, symprec = symprec)
        cif_name = name_str+'.cif'
        cif.write_file( join(sfolder, cif_name) )
        printlog('Writing cif', cif_name)
    else:
        # find_primitive() gave nothing; previously this case crashed
        printlog('Warning! primitive cell was not found, cif is not written for', name_str)

    # 3. Charge density
    path_to_chg = cl.get_chg_file('CHGCAR')
    if path_to_chg:
        printlog('path to chgcar',path_to_chg)
        shutil.copyfile(path_to_chg, join( sfolder, 'bin', name_str+'.chg'+_gz_suffix(path_to_chg)) )

    # 4. Subgroup-specific files
    if subgroup in ['dos', 'DOS']:
        DOSCAR = cl.get_file('DOSCAR', nametype = 'asoutcar')
        if DOSCAR:
            printlog('path to DOSCAR', DOSCAR)
            # the suffix is taken from the DOSCAR path itself (was taken from the CHGCAR path)
            shutil.copyfile(DOSCAR, join( sfolder, 'bin', name_str+'.dos'+_gz_suffix(DOSCAR)) )

    if subgroup in ['BAD']: # bader
        cl.get_bader_ACF()
        acf = cl.get_file(basename(cl.path['acf']))
        if acf:
            shutil.copyfile(acf, join( sfolder, 'bin', name_str+'.acf') )

    if subgroup in ['ph', 'PH']: # phonons, presumably - confirm
        xml = cl.get_file('vasprun.xml', nametype = 'asoutcar')
        if xml:
            shutil.copyfile(xml, join( sfolder, 'bin', name_str+'.xml') )

    # 5. dat folder: incars, existing files are not overwritten
    makedir( join(sfolder, 'dat', 'dummy') )
    dest = join(sfolder, 'dat')
    for inc in glob.glob( join(cl.dir, '*INCAR*') ):
        if not os.path.exists(join(dest, basename(inc))):
            shutil.copy(inc, dest)

    # kpoints
    if it in header.struct_des:
        with open( join(sfolder, 'dat', 'kpoints_for_kspacings.json'), 'w', newline = '') as fp:
            json.dump(header.struct_des[it].ngkpt_dict_for_kspacings, fp,)
    else:
        printlog('Warning!, it not in struct_des:',it )

    return