Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix lzop_build_offset_table.py and maf_build_index.py scripts under Python 3 #70

Merged
merged 1 commit into from
Sep 27, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions lib/bx/align/epo.py
Original file line number Diff line number Diff line change
@@ -169,20 +169,23 @@ def _parse_file(cls, path, pickle=False):

if fname.endswith('.pkl'):
# you asked for the pickled file. I'll give it to you
log.debug("loading pickled file %s ..." % fname)
log.debug("loading pickled file %s ...", fname)
with open(fname, "rb") as f:
return cPickle.load(f)
elif os.path.isfile("%s.pkl" % fname):
# there is a cached version I can give to you
log.info("loading pickled file %s.pkl ..." % fname)
log.info("loading pickled file %s.pkl ...", fname)
if os.stat(path).st_mtime > os.stat("%s.pkl" % fname).st_mtime:
log.critical("*** pickled file %s.pkl is not up to date ***" % (path))
with open("%s.pkl" % fname, "rb") as f:
return cPickle.load(f)
log.critical("*** pickled file %s.pkl is not up to date ***", fname)
try:
with open("%s.pkl" % fname, "rb") as f:
return cPickle.load(f)
except Exception:
log.warning("Loading pickled file %s.pkl failed", fname)

data = fastLoadChain(path, cls._strfactory)
if pickle and not os.path.isfile('%s.pkl' % fname):
log.info("pckling to %s.pkl" % (fname))
log.info("pickling to %s.pkl", fname)
with open('%s.pkl' % fname, 'wb') as f:
cPickle.dump(data, f)
return data
@@ -233,7 +236,7 @@ def _parse_epo(cls, fname):
for el in (cls._strfactory(_) for _ in fd):
if el:
data.setdefault(el.gabid, []).append(el)
log.info("parsed %d elements from %s" % (len(data), fname))
log.info("parsed %d elements from %s", len(data), fname)
return data

def cigar_iter(self, reverse):
14 changes: 14 additions & 0 deletions lib/bx/misc/seekbzip2.py
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@ def __init__(self, filename, table_filename, **kwargs):
self.init_bz2()
self.pos = 0
self.dirty = True
self.closed = False

def init_bz2(self):
self.seek_bz2 = SeekBzip2(self.filename)
@@ -47,6 +48,7 @@ def init_table(self):

def close(self):
    """Close the underlying ``seek_bz2`` reader and mark this object closed."""
    self.seek_bz2.close()
    # Only flagged once the underlying close() has returned without raising.
    self.closed = True

def fix_dirty(self):
# Our virtual position in the uncompressed data is out of sync
@@ -141,8 +143,20 @@ def __next__(self):
def __iter__(self):
    """Return ``self``; the object acts as its own iterator."""
    return self

def flush(self):
    """Do nothing.

    NOTE(review): presumably present only so the object satisfies the
    file-object interface; there is no write buffer visible here to flush.
    """

def readable(self):
    """Report that this object can be read from (always ``True``)."""
    return True

def readlines(self, sizehint=-1):
    """Return the remaining lines as a list.

    ``sizehint`` follows the ``io.IOBase.readlines`` convention: when it is
    ``None``, zero or negative (the default), every remaining line is
    returned.  Otherwise whole lines are collected until their combined
    length reaches ``sizehint``, and reading stops there.
    """
    if sizehint is None or sizehint <= 0:
        return list(self)
    lines = []
    total = 0
    for ln in self:
        lines.append(ln)
        total += len(ln)
        # The line that crosses the threshold is still returned in full.
        if total >= sizehint:
            break
    return lines

def seekable(self):
    """Report that this object supports seeking (always ``True``)."""
    return True

def xreadlines(self):
    """Return an iterator over this object, obtained with ``iter(self)``."""
    return iter(self)

def writable(self):
    """Report that this object cannot be written to (always ``False``)."""
    return False
46 changes: 46 additions & 0 deletions lib/bx/misc/seeklzop.py
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
self.init_table()
self.file = open(self.filename, "rb")
self.dirty = True
self.closed = False
self.at_eof = False
self.file_pos = 0
self.current_block_index = -1
@@ -55,6 +56,7 @@ def init_table(self):

def close(self):
    """Close the underlying ``file`` handle and mark this object closed."""
    self.file.close()
    # Only flagged once the underlying close() has returned without raising.
    self.closed = True

def load_block(self, index):
if self.cache is not None and index in self.cache:
@@ -110,6 +112,37 @@ def seek(self, offset, whence=0):
def tell(self):
    """Return the current position as tracked in ``self.file_pos``."""
    return self.file_pos

def read(self, sizehint=-1):
    """Read and return bytes from the current position.

    When ``sizehint`` is negative (the default) or ``None``, data is pulled
    through ``self._read`` in 1 MiB requests until it returns an empty
    result, and everything is returned as one bytes object.  Otherwise the
    result of a single ``self._read(sizehint)`` call is returned.

    ``None`` is accepted explicitly because ``None < 0`` raises
    ``TypeError`` on Python 3, while file objects conventionally treat
    ``read(None)`` as "read everything".
    """
    if sizehint is None or sizehint < 0:
        # iter(callable, sentinel) calls _read repeatedly and stops at the
        # first empty result, which signals end of data.
        return b"".join(iter(lambda: self._read(1024 * 1024), b""))
    return self._read(sizehint)

def _read(self, size):
    """Read up to ``size`` bytes, crossing block boundaries as needed.

    Calls ``self.fix_dirty()`` first when ``self.dirty`` is set.  Data is
    taken from ``self.current_block``; when that block is exhausted the next
    one is fetched with ``self.load_block`` and wrapped in a ``BytesIO``.
    If the last block (index ``self.nblocks - 1``) runs out, ``self.at_eof``
    is set and whatever was gathered so far is returned.  ``self.file_pos``
    is advanced by the number of bytes returned.
    """
    if self.dirty:
        self.fix_dirty()
    # Collect the pieces and join once; repeated ``bytes +=`` copies the
    # accumulated data on every block boundary.
    parts = []
    while size:
        part = self.current_block.read(size)
        size -= len(part)
        if part:
            parts.append(part)
        elif self.current_block_index == self.nblocks - 1:
            # Nothing left in the final block: end of data.
            self.at_eof = True
            break
        else:
            # Current block exhausted: advance to the next one.
            self.current_block_index += 1
            self.current_block = BytesIO(self.load_block(self.current_block_index))
    val = b"".join(parts)
    self.file_pos += len(val)
    return val

def readline(self):
if self.dirty:
self.fix_dirty()
@@ -138,6 +171,19 @@ def __next__(self):
def __iter__(self):
    """Return ``self``; the object acts as its own iterator."""
    return self

def flush(self):
    """Do nothing.

    NOTE(review): presumably present only so the object satisfies the
    file-object interface; there is no write buffer visible here to flush.
    """

def readable(self):
    """Report that this object can be read from (always ``True``)."""
    return True

def seekable(self):
    """Report that this object supports seeking (always ``True``)."""
    return True

def writable(self):
    """Report that this object cannot be written to (always ``False``)."""
    return False


# --- Factor out ---


2 changes: 1 addition & 1 deletion script_tests/base/__init__.py
Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@ def test_script(self):
all_fnames[key] = input_fnames[key]
if key == 'stdin':
stdin = open(input_fnames[key], 'r')
for key, value in output_files.items():
for key in output_files.keys():
_, tf_name = tempfile.mkstemp()
output_fnames[key] = tf_name
all_fnames[key] = output_fnames[key]
9 changes: 9 additions & 0 deletions script_tests/lzop_build_offset_table_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import unittest

import base


class Test(base.BaseScriptTest, unittest.TestCase):
    """Script test for ``lzop_build_offset_table.py``.

    Declares the lzop-compressed tiny MAF as the ``stdin`` input and the
    stored ``.lzot`` offset table as the expected ``stdout``.  How these
    attributes are consumed is defined by ``base.BaseScriptTest``, which is
    not visible here.
    """

    command_line = "./scripts/lzop_build_offset_table.py"
    input_stdin = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.lzo")
    output_stdout = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.lzot")
21 changes: 21 additions & 0 deletions script_tests/maf_build_index_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import unittest

import base


class Test1(base.BaseScriptTest, unittest.TestCase):
    """Script test for ``maf_build_index.py`` on the uncompressed tiny MAF.

    ``${maf}`` and ``${maf_index}`` in the command line share their names
    with the ``input_maf`` / ``output_maf_index`` attributes; presumably
    ``base.BaseScriptTest`` substitutes them (not visible here).
    """

    command_line = "./scripts/maf_build_index.py ${maf} ${maf_index}"
    input_maf = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf")
    output_maf_index = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.index")


class Test2(base.BaseScriptTest, unittest.TestCase):
    """Script test for ``maf_build_index.py`` on the bzip2-compressed tiny MAF.

    The expected output is the same ``.maf.index`` file used for the
    uncompressed input.  Placeholder handling is presumably done by
    ``base.BaseScriptTest`` (not visible here).
    """

    command_line = "./scripts/maf_build_index.py ${maf_bz2} ${maf_index}"
    input_maf_bz2 = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.bz2")
    output_maf_index = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.index")


class Test3(base.BaseScriptTest, unittest.TestCase):
    """Script test for ``maf_build_index.py`` on the lzop-compressed tiny MAF.

    The expected output is the same ``.maf.index`` file used for the
    uncompressed input.  Placeholder handling is presumably done by
    ``base.BaseScriptTest`` (not visible here).
    """

    command_line = "./scripts/maf_build_index.py ${maf_lzo} ${maf_index}"
    input_maf_lzo = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.lzo")
    output_maf_index = base.TestFile(filename="./test_data/maf_tests/mm8_chr7_tiny.maf.index")
4 changes: 2 additions & 2 deletions script_tests/maf_extract_ranges_indexed_tests.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,6 @@


class Test(base.BaseScriptTest, unittest.TestCase):
command_line = "./scripts/maf_extract_ranges_indexed.py ./test_data/maf_tests/mm8_chr7_tiny.maf -c -m 5 -p mm8. < ${bed}"
input_bed = base.TestFile(filename="./test_data/maf_tests/dcking_ghp074.bed")
command_line = "./scripts/maf_extract_ranges_indexed.py ./test_data/maf_tests/mm8_chr7_tiny.maf -c -m 5 -p mm8."
input_stdin = base.TestFile(filename="./test_data/maf_tests/dcking_ghp074.bed")
output_stdout = base.TestFile(filename="./test_data/maf_tests/dcking_ghp074.maf")
2 changes: 1 addition & 1 deletion scripts/bed_build_windows.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
Build windows of length `window_size` over the sequences defined by
2 changes: 1 addition & 1 deletion scripts/bed_complement.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
Complement the regions of a bed file. Requires a file that maps source names
2 changes: 1 addition & 1 deletion scripts/bed_count_by_interval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python
"""
For each interval in `bed1` count the number of intersecting regions in `bed2`.

2 changes: 1 addition & 1 deletion scripts/bed_count_overlapping.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python
"""
For each interval in `bed1` count the number of intersecting regions in `bed2`.

2 changes: 1 addition & 1 deletion scripts/bed_rand_intersect.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
From a set of regions and two sets of intervals inside those regions
2 changes: 1 addition & 1 deletion scripts/div_snp_table_chr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
FIXME!
2 changes: 1 addition & 1 deletion scripts/interval_join.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
Match up intersecting intervals from two files. This performs a "full join",
11 changes: 8 additions & 3 deletions scripts/lzop_build_offset_table.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
Read a compressed file as created by 'lzop' from stdin and write a table to
@@ -12,7 +12,7 @@
import struct
import sys

MAGIC = "\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"
MAGIC = b"\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

F_ADLER32_D = 0x00000001
F_ADLER32_C = 0x00000002
@@ -41,7 +41,12 @@ def get(self, fmt):


def main():
f = UnpackWrapper(sys.stdin)
try:
binary_stdin = sys.stdin.buffer
except AttributeError:
# Python 2
binary_stdin = sys.stdin
f = UnpackWrapper(binary_stdin)
# Read header
magic = f.read(9)
assert magic == MAGIC, "Not LZOP file"
2 changes: 1 addition & 1 deletion scripts/mMK_bitset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python
from __future__ import print_function

from optparse import OptionParser
2 changes: 1 addition & 1 deletion scripts/maf_build_index.py
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@
from bx import interval_index_file
from bx.cookbook import doc_optparse
from bx.misc.seekbzip2 import SeekableBzip2File
from bx.misc.seeklzop import SeekableLzopFile


def main():
@@ -36,7 +37,6 @@ def main():
# Strip .bz2 from the filename before adding ".index"
maf_file = maf_file[:-4]
elif maf_file.endswith(".lzo"):
from bx.misc.seeklzop import SeekableLzopFile
table_file = maf_file + "t"
if not os.path.exists(table_file):
doc_optparse.exit("To index lzo compressed files first "
2 changes: 1 addition & 1 deletion scripts/maf_count.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a MAF from standard input and print counts of alignments, bases, or
2 changes: 1 addition & 1 deletion scripts/maf_covered_ranges.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
usage: %prog species1,species2,... nrequired < maf
2 changes: 1 addition & 1 deletion scripts/maf_div_sites.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python
"""
Create a bed file listing all the divergent sites between two specific species
in a maf.
2 changes: 1 addition & 1 deletion scripts/maf_gc_content.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a MAF from standard input and print average GC content of each alignment
2 changes: 1 addition & 1 deletion scripts/maf_interval_alignibility.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
WARNING: bz2/bz2t support and file cache support are new and not as well
2 changes: 1 addition & 1 deletion scripts/maf_limit_to_species.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a maf file from stdin and write out a new maf with only blocks having all
2 changes: 1 addition & 1 deletion scripts/maf_mapping_word_frequency.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""

2 changes: 1 addition & 1 deletion scripts/maf_mean_length_ungapped_piece.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a MAF from standard input and determine the mean length of ungapped pieces
2 changes: 1 addition & 1 deletion scripts/maf_percent_columns_matching.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a PAIRWISE maf from stdin and print the fraction of columns whose bases
2 changes: 1 addition & 1 deletion scripts/maf_percent_identity.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a PAIRWISE maf from stdin and print the percent identity of each
2 changes: 1 addition & 1 deletion scripts/maf_print_chroms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a maf from stdin and print the chromosome number for each alignment. It
2 changes: 1 addition & 1 deletion scripts/maf_species_in_all_files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.4
#!/usr/bin/env python

"""
Takes a list of maf filenames on the command line and prints a comma separated
2 changes: 1 addition & 1 deletion scripts/maf_split_by_src.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a MAF from stdin and break into several mafs based on the source of
2 changes: 1 addition & 1 deletion scripts/maf_word_frequency.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read a MAF and print counts and frequencies of all n-mers
2 changes: 1 addition & 1 deletion scripts/tfloc_summary.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.3
#!/usr/bin/env python

"""
Read TFLOC output from stdin and write out a summary in which the nth line
Binary file added test_data/maf_tests/mm8_chr7_tiny.maf.bz2
Binary file not shown.
1 change: 1 addition & 0 deletions test_data/maf_tests/mm8_chr7_tiny.maf.bz2t
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
32 10589
Binary file added test_data/maf_tests/mm8_chr7_tiny.maf.lzo
Binary file not shown.
2 changes: 2 additions & 0 deletions test_data/maf_tests/mm8_chr7_tiny.maf.lzot
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
s 10589
o 67 4452 10589