# Source code for httk.httkio.cif

#
#    The high-throughput toolkit (httk)
#    Copyright (C) 2012-2015 Rickard Armiento
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os, sys, re
from httk.core import is_string
from collections import OrderedDict

# Bind maketrans_ to the translation-table factory of the running interpreter:
# str.maketrans on Python 3, string.maketrans otherwise.
if sys.version_info[0] == 3:
    maketrans_ = str.maketrans
else:
    import string
    maketrans_ = string.maketrans

import httk

from httk.core import *


def _read_cif_rewind_if_needed(f, row, done_fields):
    """
    Push the not yet consumed tail of a row back onto the rewindable iterator.

    f = iterator the row was read from; only its rewind(text) method is used here

    row = the raw row that was read

    done_fields = number of leading whitespace-separated fields of row that have already been consumed

    Returns True if non-blank text remained after the consumed fields and was pushed back, otherwise False.
    """
    parts = row.lstrip().split(None, done_fields)
    # split(None, n) only produces an element beyond the first n when text remains after them.
    # Comparing against done_fields (rather than 1) keeps an already consumed field from being pushed back.
    if len(parts) <= done_fields:
        return False
    rest = parts[-1]
    if rest.strip() == "":
        return False
    f.rewind(rest)
    return True


def _read_cif_loop(f, pragmatic=True, use_types=False):
    """
    Read the body of a cif loop_ construct: first the column tags, then the data values.

    f = rewindable iterator positioned just after the loop_ keyword

    pragmatic, use_types = passed on unchanged to _read_cif_data_value

    Returns an OrderedDict that maps each lowercased column tag (leading underscore removed) to the
    list of values read for that column. Reading stops at the end of input or at the first row that
    starts a new tag, data_ block or loop_; that row is pushed back onto f.
    """
    noteol = False
    loop_data = OrderedDict()
    header = []
    for row in f:
        striprow = row.strip()
        if not striprow.startswith("_"):
            f.rewind()
            break
        # Only the first token is the tag. Anything following it on the same row is pushed back
        # below and must not leak into the column name.
        tag = striprow.split(None, 1)[0].lower()[1:]
        loop_data[tag] = []
        header += [tag]
        noteol = _read_cif_rewind_if_needed(f, row, 1)

    if not header:
        # A loop_ without column tags has no values to read; without this guard the loop below never ends.
        return loop_data

    while True:
        for tag in header:
            row = next(f, None)
            # Blank rows separate nothing: skip them without moving on to the next column
            while row is not None and row.isspace():
                noteol = False
                row = next(f, None)
            if row is None:
                return loop_data
            lowrow = row.strip().lower()
            if not row or lowrow.startswith(("_", "data_", "loop_")):
                f.rewind()
                return loop_data
            # The row was only peeked at; hand it back so the value reader sees it in full
            f.rewind()
            val, noteol = _read_cif_data_value(f, noteol, pragmatic, use_types, inloop=True)
            loop_data[tag].append(val)


def _read_cif_data_value(f, noteol, pragmatic=True, use_types=False, inloop=False):
    """
    Read the next cif data value from the rewindable iterator f.

    f = rewindable iterator over rows; the unread remainder of a row is pushed back with f.rewind(text)

    noteol = True if the next row from f is the remainder of a partly consumed line, in which case a
    leading semicolon does not start a multi-line text field

    pragmatic = if True, a text field whose opening line holds nothing but the semicolon does not get a
    leading newline, and outside loops an unquoted value extends over all whitespace-separated words up to
    the next word that starts with an underscore, data_ or loop_

    use_types = if True, unquoted values that look like cif integers or floats are converted to int / float

    inloop = True when reading values of a loop, where every whitespace-separated word is its own value

    Returns the tuple (data_value, noteol). data_value is None if the input ended before a value was found.
    """
    data_value = None
    for row in f:
        striprow = row.strip()
        if striprow == "":
            noteol = False
            continue
        elif (not noteol) and row.startswith(';'):
            folded = False
            newline = False
            data_value = ""
            # Text following the opening semicolon. Slicing (instead of indexing row[1]) also copes
            # with a bare ';' that has no line terminator.
            first = row[1:].rstrip('\r\n')
            if first == "\\":
                folded = True
            elif first == "" or first.isspace():
                if not pragmatic:
                    # Strictly by the spec the (empty) opening line is part of the value
                    data_value = first
                    newline = True
            else:
                data_value = first
                newline = True
            terminator = None
            for irow in f:
                if irow.startswith(';'):
                    terminator = irow.strip()
                    break
                if newline:
                    data_value += '\n'
                content = irow.rstrip('\r\n')
                if folded and content.endswith("\\"):
                    data_value += content.rstrip("\\")
                    newline = False
                else:
                    data_value += content
                    newline = True
            # Only a real closing row can carry trailing text; if the input ended inside the text field
            # there is nothing to push back.
            if terminator is not None and len(terminator) > 1:
                f.rewind(terminator[1:])
                noteol = True
            else:
                noteol = False
            break
        elif striprow.startswith("'") or striprow.startswith('"'):
            # The cif quoting rules are ... weird. Quotes are "escaped" if they are not followed by whitespace.
            quote = striprow[0]
            starti = 1
            for chari in range(1, len(striprow)-1):
                if striprow[chari] == quote and str(striprow[chari+1]).isspace():
                    endi = chari
                    endq = chari+1
                    break
            else:
                if striprow[-1] != quote:
                    # No closing quote at all: take the whole row, opening quote included, as the value
                    starti = 0
                    endi = len(striprow)
                    endq = len(striprow)
                else:
                    endi = len(striprow)-1
                    endq = len(striprow)
            data_value = striprow[starti:endi]
            if endq != len(striprow):
                f.rewind(striprow[endq:])
                noteol = True
            else:
                noteol = False
            break
        else:
            # Unquoted string
            if pragmatic and not inloop:
                # In pragmatic mode, if we are not in a loop and there is more than one data value
                # separated by whitespace, read all of it. This should always be ok to do, since
                # multiple data values in this situation would be an error in the file otherwise,
                # but if there is whitespace + underscore/data_/loop_ we parse that as a new symbol,
                # since otherwise we COULD misread valid files (with very weird formatting...).
                # The lookahead keeps the marker itself in the pushed back text, so that the
                # new symbol is still recognizable to the caller.
                splitstr = re.split(r"\s+(?=_|data_|loop_)", striprow, maxsplit=1)
            else:
                splitstr = striprow.split(None, 1)
            data_value = splitstr[0].strip()
            rightside = ""
            if len(splitstr) > 1:
                rightside = splitstr[1].strip()
            if rightside != "":
                f.rewind(rightside)
                noteol = True
            else:
                noteol = False
            if use_types:
                # Only unquoted values are converted; this has to happen before leaving the loop
                try:
                    if _cif_is_int(data_value):
                        data_value = int(data_value)
                    elif _cif_is_float(data_value):
                        data_value = float(data_value)
                except ValueError:
                    # Looked numeric to the pattern but Python cannot parse it: keep the string
                    pass
            break

    return data_value, noteol


def _read_cif_data_block(f, pragmatic=True, use_types=False):
    """
    Read the items of one cif data block from the rewindable iterator f.

    Reading goes on until the input ends or a row starting a new data_ block is found; that row is
    pushed back onto f for the caller.

    Returns an OrderedDict of lowercased tag names (without leading underscore) to values. For the
    N:th loop of the block there is an entry 'loop_N' holding the list of its column names, and one
    entry per column holding the list of values of that column.
    """
    items = OrderedDict()
    loop_count = 0
    for line in f:
        stripped = line.strip()
        lowered = stripped.lower()

        # Comment rows carry no data
        if stripped.startswith("#"):
            continue

        # Next data block begins: hand the row back and stop
        if lowered.startswith("data_"):
            f.rewind()
            break

        if lowered.startswith("loop_"):
            _read_cif_rewind_if_needed(f, line, 1)
            columns = _read_cif_loop(f, pragmatic, use_types)
            items['loop_'+str(loop_count)] = list(columns.keys())
            loop_count += 1
            items.update(columns)
            continue

        if stripped.startswith(";"):
            # A multi-line string that is not tied to any name: consume it up to its closing row
            for skipped in f:
                if skipped.rstrip() == ";":
                    break
            continue

        if stripped.startswith("_"):
            tokens = lowered.split()
            tag = tokens[0][1:]
            value_on_same_row = len(tokens) > 1
            if value_on_same_row:
                # Give the text after the tag back to the iterator for the value reader
                f.rewind(stripped.split(None, 1)[1].strip())
            value, _ = _read_cif_data_value(f, value_on_same_row, pragmatic, use_types, inloop=False)
            items[tag] = value

    return items


def read_cif(ioa, pragmatic=True, use_types=False):
    """
    Generic cif reader, given a filename / ioadapter it places all data in python dictionaries.

    It returns a tuple: (datalist, header)

    header is a string with the rows at the very top of the file that start with '#'.

    datalist is a list of pairs (data block name, data block), where the names are lowercased.

    Each data block is an ordered dictionary with tag_name:value, where tag names are lowercased and
    given without the leading underscore.

    Loops are stored flattened in the data block: every loop column is an entry column_name:list of values,
    and for the N:th loop of the block (counting from 0) the entry 'loop_N' holds the list of its column names.

    The optional parameter pragmatic regulates handling of some counter-intuitive aspects of the cif specification, where
    the default pragmatic=True handles these features the way people usually use them, whereas pragmatic=False means
    to read the cif file precisely according to the spec. For example, in a multiline text field::

        ;
        some text
        ;

    Means the string '\\nsome text'. For this specific case pragmatic=True removes the leading newline.

    set use_types to True to convert things that look like floats and integers to those respective types
    """
    ioa = IoAdapterFileReader.use(ioa)
    try:
        f = basic.rewindable_iterator(ioa.file)
        header = ""
        datalist = []
        for row in f:
            if row.strip().startswith("#"):
                header += row
            else:
                f.rewind()
                break

        for row in f:
            lowrow = row.strip().lower()
            if lowrow.startswith("data_"):
                # A bare 'data_' row has no name token; use an empty name rather than fail with IndexError
                name_tokens = lowrow.partition('_')[2].split()
                data_block_name = name_tokens[0] if name_tokens else ""
                _read_cif_rewind_if_needed(f, row, 1)
                data_block = _read_cif_data_block(f, pragmatic, use_types)
                datalist += [(data_block_name, data_block)]
    finally:
        # Release the adapter also when parsing fails
        ioa.close()
    return datalist, header

# Character classes used when validating names. The backslash is one of the ordinary
# characters, hence the explicit '\\' in the literal.
_cif_ordinary_char = "!%&()*+,-./0123456789:<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^`abcdefghijklmnopqrstuvwxyz{|}~"
_cif_non_blank_char = _cif_ordinary_char+'"'+"#$"+"'"+"_"+";[]"
_cif_text_lead_char = _cif_ordinary_char+'"'+"#$"+"'"+"_ \t[]"
_cif_any_print_char = _cif_ordinary_char+'"'+"#$"+"'"+"_ \t;[]"

# Python 2: used as the 'deletechars' argument of str.translate, see _cif_validate_non_blank_char
_cif_non_blank_char_table = maketrans_(_cif_non_blank_char, ' ' * len(_cif_non_blank_char))
_cif_helper_table = maketrans_('', '')

# Python 3 specific
if sys.version_info[0] == 3:
    _cif_non_blank_char_table = maketrans_(_cif_non_blank_char, _cif_non_blank_char)

    class _CifDeleteUnlistedTable(dict):
        """
        Translation table for str.translate that keeps the listed code points and deletes all others.

        str.translate looks code points up with table[ordinal] and deletes those that map to None, so
        answering None for every missing key has the same effect as a table with one None entry per
        unicode code point, without building those entries when the module is imported.
        """

        def __missing__(self, key):
            return None

    _cif_unicode_translation_table = _CifDeleteUnlistedTable(_cif_non_blank_char_table)

_cif_integer_regex = re.compile(r'^[+-]?[0-9]+$')
# The alternatives are grouped so that both the sign and the end anchor apply to all of them:
# digits with exponent, or digits with a decimal point and an optional exponent.
_cif_float_regex = re.compile(r'^[+-]?(?:[0-9]+[eE][+-]?[0-9]+|(?:[0-9]*\.[0-9]+|[0-9]+\.)(?:[eE][+-]?[0-9]+)?)$')
_cif_simplestring_regex = re.compile(r'^[A-Za-z0-9()][A-Za-z0-9()+-]*$')

def _cif_validate_name(name_unfiltered, context=None):
    """
    Return name_unfiltered in a form that can be written as a cif name.

    Non-permitted characters are removed by _cif_validate_non_blank_char and the result is cut to at most
    75 characters. A warning is written to stderr when the name has to be cut.

    context = optional description of where the name is used, included in the warnings
    """
    if context is not None:
        context = context+": "+name_unfiltered
    name = _cif_validate_non_blank_char(name_unfiltered, context)
    if len(name) > 75:
        # context is None when called for plain data names; the warning must still be possible to build.
        # When context is given it already ends with the name, so the name is not appended once more.
        if context is None:
            where = name_unfiltered
        else:
            where = context
        sys.stderr.write("***Warning: write_cif: name length > 75, surplus characters removed in "+where)
        name = name[:75]
    return name


def _cif_is_float(data_value):
    """Return True if the module's cif float pattern matches data_value from its start, otherwise False."""
    found = _cif_float_regex.match(data_value)
    return found is not None


def _cif_is_simplestring(data_value):
    """Return True if the module's simple-string pattern matches data_value from its start, otherwise False."""
    found = _cif_simplestring_regex.match(data_value)
    return found is not None


def _cif_is_int(data_value):
    """Return True if the module's cif integer pattern matches data_value from its start, otherwise False."""
    found = _cif_integer_regex.match(data_value)
    return found is not None


def _cif_validate_non_blank_char(s, context=None):
    """
    Return s with every character removed that is not listed in _cif_non_blank_char.

    If anything was removed a warning is written to stderr; context, when given, is included in it.
    """
    running_py3 = sys.version_info[0] == 3
    if running_py3:
        filtered = s.translate(_cif_unicode_translation_table)
    else:
        # Python 2 form: identity table plus a string of characters to delete
        filtered = s.translate(_cif_helper_table, _cif_non_blank_char_table)

    if filtered == s:
        return filtered

    if context is None:
        warning = "***Warning: write_cif: non-permitted characters removed."
    else:
        warning = "***Warning: write_cif: non-permitted characters in "+context+" removed."
    sys.stderr.write(warning)
    return filtered


def _cif_write_semicolontextfield(f, lines, noteol, max_line_length):
    """
    Write lines as a semicolon delimited multi-line text field.

    f = file-like object to write to

    lines = list of the lines of the value; it is normalized in place (line terminators stripped, and a
    space inserted in front of lines that start with a semicolon)

    noteol = True if the output is currently in the middle of a line, in which case a newline is written first

    max_line_length = if any line is longer than this, the text field is written in folded form, where long
    lines are cut in pieces and every piece except the last one of a line ends with a backslash

    Always returns False, since the output ends with a newline.
    """
    if noteol:
        f.write("\n")
    if not lines:
        # Nothing to write still needs the opening and closing semicolon on separate rows
        lines = [""]

    # Normalize ALL lines before deciding on folding, so that lines after the first long one are
    # also protected against starting with a semicolon.
    folded = False
    for i, line in enumerate(lines):
        line = line.rstrip("\r\n")
        if line.startswith(';'):
            sys.stderr.write("***Warning: write_cif: had to insert space before semicolon at the start of a line of a multi-line string to fulfill arcane quoting rules.")
            line = ' '+line
        lines[i] = line
        if len(line) > max_line_length:
            folded = True

    if folded:
        f.write(";\\"+"\n")
    else:
        # The first line continues directly after the opening semicolon
        f.write(";")

    # Two characters are held back: one for the trailing backslash, one for a semicolon moved up from the next piece
    piece_length = max(1, max_line_length-2)
    for line in lines:
        if len(line) > max_line_length:
            sublines = [line[i:i+piece_length] for i in range(0, len(line), piece_length)]
            # Handle a wonderful corner case: the line splitting for length creates lines that start with one, or more, semi-colons..., sigh...
            for i in range(1, len(sublines)):
                if sublines[i][0] == ";":
                    if len(sublines[i]) > 1 and sublines[i][1] != ";":
                        # If its just a single semi-colon, move it to the previous line, which we saved space for by splitting at max_line_length-2
                        sublines[i-1] += ";"
                        sublines[i] = sublines[i][1:]
                    else:
                        # Multiple semi-colons in a row, or a semi-colon + newline, this is a possibly unresolvable case (think long string of only semi-colons)
                        # fudge a solution by inserting a space
                        sys.stderr.write("***Warning: write_cif: had to insert space before semicolon in a long string to fulfill arcane quoting rules.")
                        sublines[i] = " "+sublines[i]
            # Only the joints get a backslash: one after the last piece would glue the next line onto this one when read back
            f.write(("\\"+"\n").join(sublines)+"\n")
        else:
            f.write(line+"\n")

    f.write(";\n")
    return False


def _cif_write_data_value(f, orig_data_value, noteol, max_line_length, use_types, inloop):
    """
    Write one data value to f, choosing a quoting style that fits the value.

    f = file-like object to write to

    orig_data_value = the value; None is written as an empty quoted string, anything else via str()

    noteol = True if the output is currently in the middle of a line (only relevant for text fields)

    max_line_length = values that do not fit on one line together with quotes are written as text fields

    use_types = if True, values of string type are always quoted and other values never; if False, values
    that look like cif numbers (and, inside loops, simple strings) are written without quotes

    inloop = True when the value is written as part of a loop

    Returns True if the output was left in the middle of a line, False if it ends with a newline.
    """
    if orig_data_value is None:
        data_value = ""
    else:
        data_value = str(orig_data_value)
    has_whitespace = len(data_value.split()) > 1
    lines = data_value.splitlines()
    # splitlines() silently drops a final line terminator, so counting lines is not enough: any value
    # that is not exactly one terminator-free line must go into a text field, as a quoted string
    # cannot span rows.
    has_lines = data_value != "" and lines != [data_value]
    has_single_quote = data_value.find("'") != -1
    has_double_quote = data_value.find('"') != -1
    too_long = len(data_value) + 2 > max_line_length
    if has_lines or (has_single_quote and has_double_quote) or too_long:
        noteol = _cif_write_semicolontextfield(f, lines, noteol, max_line_length)
        return noteol
    elif has_double_quote or (has_whitespace and not has_single_quote) or data_value == "":
        f.write("'"+data_value+"'")
        return True
    elif has_single_quote or (has_whitespace and not has_double_quote):
        f.write('"'+data_value+'"')
        return True
    elif not use_types:
        # Skip quotes if it looks like a number or is a simple string used in a loop
        unquoted = (_cif_is_float(data_value) or _cif_is_int(data_value)
                    or (inloop and _cif_is_simplestring(data_value)))
        if unquoted:
            f.write(data_value)
        else:
            f.write("'"+data_value+"'")
        return True
    else:
        # Always quote when a string, never quote otherwise
        if is_string(orig_data_value):
            f.write("'"+data_value+"'")
        else:
            f.write(data_value)
        return True


def _cif_write_data_block(f, data_block, max_line_length, use_types):
    """
    Write the items of one data block to f.

    f = file-like object to write to

    data_block = dictionary of the block. Keys starting with 'loop_' hold a list of column names; the
    values of each column are looked up in data_block under the column name. Other entries whose value is
    a sequence (as judged by basic.is_sequence) are skipped here, every remaining entry is written as
    a single name / value item.

    max_line_length, use_types = passed on to _cif_write_data_value
    """
    for key in data_block:
        val = data_block[key]
        if key.startswith("loop_"):
            f.write("loop_\n")
            outdata_columns = []
            for unfiltered_column in val:
                column = _cif_validate_non_blank_char(unfiltered_column, "column name: "+unfiltered_column)
                f.write("_"+column+"\n")
                outdata_columns += [data_block[unfiltered_column]]
            if len(outdata_columns) > 0:
                noteol = False
                # The number of rows is taken from the first column
                for i in range(len(outdata_columns[0])):
                    column_count = 0
                    for column_values in outdata_columns:
                        # Room for the value plus a pair of quotes
                        width = len(str(column_values[i]))+2
                        column_count += width
                        if column_count > max_line_length and noteol:
                            f.write("\n")
                            # The value about to be written is the first one on the new line,
                            # so its width has to be counted there.
                            column_count = width
                            noteol = False
                        noteol = _cif_write_data_value(f, column_values[i], noteol, max_line_length, use_types, inloop=True)
                        if noteol:
                            f.write(" ")
                            column_count += 1
                        else:
                            column_count = 0
                    if noteol:
                        noteol = False
                        f.write("\n")
        elif basic.is_sequence(val):
            continue
        else:
            data_name = _cif_validate_name(key)
            # Do we have space _ + key + space + quote + the whole data value + quote?, if not, preemptively break line
            f.write("_"+data_name+" ")
            if len(data_name)+len(str(val))+4 > max_line_length:
                f.write("\n")
                noteol = False
            else:
                noteol = True
            noteol = _cif_write_data_value(f, val, noteol, max_line_length, use_types, inloop=False)
            if noteol:
                f.write("\n")
                noteol = False


def write_cif(ioa, data, header=None, max_line_length=80, use_types=False):
    """
    Generic cif writer, given a filename / ioadapter

    data = the cif data to write, as a sequence of pairs (data block name, data block), where each data block
    is an (ordered) dictionary of tag_name:value. A data block name that is None, or that is empty after
    non-permitted characters have been removed, is replaced by 'data_N' with N the position of the block.

    header = the header (comment) segment, written first. Header lines longer than max_line_length are
    written as several comment lines, see the comment in the code.

    max_line_length = the maximum number of characters allowed on each line. This should not be set < 80
    (there is no point, and the length calculating algorithm breaks down at some small line length)

    use_types =

       if True: always quote values that are of string type. Numeric values are put in the file unquoted (as they should)
       if False (default): also strings that look like cif numbers are put in the file unquoted

    Loops are given in the data block as an entry 'loop_N' holding the list of column names, with the values of
    each column stored in the same data block as column_name:list of values.
    """
    ioa = IoAdapterFileWriter.use(ioa)
    try:
        f = ioa.file

        if header is not None:
            # Room for the leading '#' and the trailing backslash of a folded piece
            piece_length = max(1, max_line_length-2)
            for line in header.splitlines():
                if len(line) <= max_line_length:
                    f.write(line+"\n")
                    continue
                # Fold a too long line into several rows that ALL start with '#', so that every row is
                # still a comment. The '#\' marker row and the trailing backslashes are meant to follow
                # the cif line folding convention for comments.
                # NOTE(review): confirm the exact folded-comment layout against the cif specification.
                if line.startswith("#"):
                    text = line[1:]
                else:
                    text = line
                pieces = [text[i:i+piece_length] for i in range(0, len(text), piece_length)] or [""]
                f.write("#\\"+"\n")
                for piece in pieces[:-1]:
                    f.write("#"+piece+"\\"+"\n")
                f.write("#"+pieces[-1]+"\n")

        for data_block_count, data_block in enumerate(data):
            data_block_name_unfiltered = data_block[0]
            if data_block_name_unfiltered is None:
                data_block_name = "data_"+str(data_block_count)
            else:
                data_block_name = _cif_validate_name(data_block_name_unfiltered, "data block name")
                if data_block_name == "":
                    data_block_name = "data_"+str(data_block_count)

            f.write("data_"+data_block_name+"\n")
            _cif_write_data_block(f, data_block[1], max_line_length, use_types)
    finally:
        # Release the adapter also when writing fails
        ioa.close()


def main():
    """
    Entry point when the module is run as a script; currently it only exits with status 0.

    The commented-out lines sketch a round trip check: read a cif file, write it back out, read the
    copy and compare the two results.
    """
    #gurk = open("/tmp/gurk.cif", "r")
    #datalist, header = read_cif(gurk)

    #gurk = open("/tmp/gurk2.cif", "w")
    #write_cif(gurk, datalist, header)
    #gurk.close()

    #datalist2, header2 = read_cif("/tmp/gurk2.cif")

    #print("MATCH1", header == header2)
    #print("MATCH2", datalist == datalist2)

    # sys.exit rather than the exit() helper, which is only installed by the site module
    sys.exit(0)


if __name__ == "__main__":
    main()
# Source code for httk.httkio.cif
#
# Navigation
#
# Python API