Martin@1
|
1 |
#! /usr/bin/env python
|
Martin@1
|
2 |
|
Martin@272
|
3 |
# Read pair-wise alignments in MAF or LAST tabular format: write an
|
Martin@272
|
4 |
# "Oxford grid", a.k.a. dotplot.
|
Martin@1
|
5 |
|
Martin@1
|
6 |
# TODO: Currently, pixels with zero aligned nt-pairs are white, and
|
Martin@1
|
7 |
# pixels with one or more aligned nt-pairs are black. This can look
|
Martin@1
|
8 |
# too crowded for large genome alignments. I tried shading each pixel
|
Martin@1
|
9 |
# according to the number of aligned nt-pairs within it, but the
|
Martin@1
|
10 |
# result is too faint. How can this be done better?
|
Martin@1
|
11 |
|
Martin@482
|
12 |
import fileinput, itertools, optparse, os, re, sys
|
Martin@475
|
13 |
|
Martin@475
|
14 |
# Try to make PIL/PILLOW work:
|
Martin@475
|
15 |
try: from PIL import Image, ImageDraw, ImageFont, ImageColor
|
Martin@475
|
16 |
except ImportError: import Image, ImageDraw, ImageFont, ImageColor
|
Martin@1
|
17 |
|
Martin@644
|
18 |
def warn(message):
|
Martin@644
|
19 |
prog = os.path.basename(sys.argv[0])
|
Martin@644
|
20 |
sys.stderr.write(prog + ": " + message + "\n")
|
Martin@644
|
21 |
|
Martin@482
|
22 |
def tabBlocks(beg1, beg2, blocks):
|
Martin@482
|
23 |
'''Get the gapless blocks of an alignment, from LAST tabular format.'''
|
Martin@482
|
24 |
for i in blocks.split(","):
|
Martin@482
|
25 |
if ":" in i:
|
Martin@482
|
26 |
x, y = i.split(":")
|
Martin@482
|
27 |
beg1 += int(x)
|
Martin@482
|
28 |
beg2 += int(y)
|
Martin@482
|
29 |
else:
|
Martin@482
|
30 |
size = int(i)
|
Martin@482
|
31 |
yield beg1, beg2, size
|
Martin@482
|
32 |
beg1 += size
|
Martin@482
|
33 |
beg2 += size
|
Martin@272
|
34 |
|
Martin@482
|
35 |
def mafBlocks(beg1, beg2, seq1, seq2):
|
Martin@482
|
36 |
'''Get the gapless blocks of an alignment, from MAF format.'''
|
Martin@482
|
37 |
size = 0
|
Martin@482
|
38 |
for x, y in itertools.izip(seq1, seq2):
|
Martin@482
|
39 |
if x == "-":
|
Martin@482
|
40 |
if size:
|
Martin@482
|
41 |
yield beg1, beg2, size
|
Martin@482
|
42 |
beg1 += size
|
Martin@482
|
43 |
beg2 += size
|
Martin@482
|
44 |
size = 0
|
Martin@482
|
45 |
beg2 += 1
|
Martin@482
|
46 |
elif y == "-":
|
Martin@482
|
47 |
if size:
|
Martin@482
|
48 |
yield beg1, beg2, size
|
Martin@482
|
49 |
beg1 += size
|
Martin@482
|
50 |
beg2 += size
|
Martin@482
|
51 |
size = 0
|
Martin@482
|
52 |
beg1 += 1
|
Martin@272
|
53 |
else:
|
Martin@482
|
54 |
size += 1
|
Martin@482
|
55 |
if size: yield beg1, beg2, size
|
Martin@272
|
56 |
|
Martin@482
|
57 |
def alignmentInput(lines):
|
Martin@482
|
58 |
'''Get alignments and sequence lengths, from MAF or tabular format.'''
|
Martin@482
|
59 |
mafCount = 0
|
Martin@272
|
60 |
for line in lines:
|
Martin@272
|
61 |
w = line.split()
|
Martin@272
|
62 |
if line[0].isdigit(): # tabular format
|
Martin@482
|
63 |
chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
|
Martin@482
|
64 |
if w[4] == "-": beg1 -= seqlen1
|
Martin@482
|
65 |
chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
|
Martin@482
|
66 |
if w[9] == "-": beg2 -= seqlen2
|
Martin@482
|
67 |
blocks = tabBlocks(beg1, beg2, w[11])
|
Martin@482
|
68 |
yield chr1, seqlen1, chr2, seqlen2, blocks
|
Martin@272
|
69 |
elif line[0] == "s": # MAF format
|
Martin@482
|
70 |
if mafCount == 0:
|
Martin@482
|
71 |
chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
|
Martin@482
|
72 |
if w[4] == "-": beg1 -= seqlen1
|
Martin@482
|
73 |
mafCount = 1
|
Martin@482
|
74 |
else:
|
Martin@482
|
75 |
chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
|
Martin@482
|
76 |
if w[4] == "-": beg2 -= seqlen2
|
Martin@482
|
77 |
blocks = mafBlocks(beg1, beg2, seq1, seq2)
|
Martin@482
|
78 |
yield chr1, seqlen1, chr2, seqlen2, blocks
|
Martin@482
|
79 |
mafCount = 0
|
Martin@272
|
80 |
|
Martin@482
|
81 |
def readAlignments(lines):
|
Martin@482
|
82 |
'''Get alignments and sequence lengths, from MAF or tabular format.'''
|
Martin@482
|
83 |
alignments = []
|
Martin@482
|
84 |
seqLengths1 = {}
|
Martin@482
|
85 |
seqLengths2 = {}
|
Martin@482
|
86 |
for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines):
|
Martin@482
|
87 |
aln = chr1, chr2, blocks
|
Martin@482
|
88 |
alignments.append(aln)
|
Martin@482
|
89 |
seqLengths1[chr1] = seqlen1
|
Martin@482
|
90 |
seqLengths2[chr2] = seqlen2
|
Martin@482
|
91 |
return alignments, seqLengths1, seqLengths2
|
Martin@1
|
92 |
|
Martin@1
|
93 |
def natural_sort_key(my_string):
|
Martin@1
|
94 |
'''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
|
Martin@1
|
95 |
parts = re.split(r'(\d+)', my_string)
|
Martin@1
|
96 |
parts[1::2] = map(int, parts[1::2])
|
Martin@1
|
97 |
return parts
|
Martin@1
|
98 |
|
Martin@1
|
99 |
def get_text_sizes(my_strings):
|
Martin@1
|
100 |
'''Get widths & heights, in pixels, of some strings.'''
|
Martin@95
|
101 |
if opts.fontsize == 0: return [(0, 0) for i in my_strings]
|
Martin@1
|
102 |
image_size = 1, 1
|
Martin@134
|
103 |
im = Image.new(image_mode, image_size)
|
Martin@1
|
104 |
draw = ImageDraw.Draw(im)
|
Martin@1
|
105 |
return [draw.textsize(i, font=font) for i in my_strings]
|
Martin@1
|
106 |
|
Martin@1
|
107 |
def get_seq_info(seq_size_dic):
|
Martin@1
|
108 |
'''Return miscellaneous information about the sequences.'''
|
Martin@1
|
109 |
seq_names = seq_size_dic.keys()
|
Martin@1
|
110 |
seq_names.sort(key=natural_sort_key)
|
Martin@1
|
111 |
seq_sizes = [seq_size_dic[i] for i in seq_names]
|
Martin@1
|
112 |
name_sizes = get_text_sizes(seq_names)
|
Martin@28
|
113 |
margin = max(zip(*name_sizes)[1]) # maximum text height
|
Martin@1
|
114 |
return seq_names, seq_sizes, name_sizes, margin
|
Martin@1
|
115 |
|
Martin@1
|
116 |
def div_ceil(x, y):
|
Martin@1
|
117 |
'''Return x / y rounded up.'''
|
Martin@1
|
118 |
q, r = divmod(x, y)
|
Martin@1
|
119 |
return q + (r != 0)
|
Martin@1
|
120 |
|
Martin@1
|
121 |
def tot_seq_pix(seq_sizes, bp_per_pix):
|
Martin@1
|
122 |
'''Return the total pixels needed for sequences of the given sizes.'''
|
Martin@28
|
123 |
return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
|
Martin@1
|
124 |
|
Martin@1
|
125 |
def get_bp_per_pix(seq_sizes, pix_limit):
|
Martin@1
|
126 |
'''Get the minimum bp-per-pixel that fits in the size limit.'''
|
Martin@1
|
127 |
seq_num = len(seq_sizes)
|
Martin@1
|
128 |
seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
|
Martin@1
|
129 |
if seq_pix_limit < seq_num:
|
Martin@1
|
130 |
sys.exit(my_name + ": can't fit the image: too many sequences?")
|
Martin@51
|
131 |
lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
|
Martin@1
|
132 |
for bp_per_pix in itertools.count(lower_bound): # slow linear search
|
Martin@1
|
133 |
if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
|
Martin@1
|
134 |
return bp_per_pix
|
Martin@1
|
135 |
|
Martin@1
|
136 |
def get_seq_starts(seq_pix, pix_tween_seqs, margin):
|
Martin@1
|
137 |
'''Get the start pixel for each sequence.'''
|
Martin@1
|
138 |
seq_starts = []
|
Martin@1
|
139 |
pix_tot = margin - pix_tween_seqs
|
Martin@1
|
140 |
for i in seq_pix:
|
Martin@1
|
141 |
pix_tot += pix_tween_seqs
|
Martin@1
|
142 |
seq_starts.append(pix_tot)
|
Martin@1
|
143 |
pix_tot += i
|
Martin@1
|
144 |
return seq_starts
|
Martin@1
|
145 |
|
Martin@1
|
146 |
def get_pix_info(seq_sizes, margin):
|
Martin@1
|
147 |
'''Return pixel information about the sequences.'''
|
Martin@1
|
148 |
seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
|
Martin@1
|
149 |
seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
|
Martin@1
|
150 |
tot_pix = seq_starts[-1] + seq_pix[-1]
|
Martin@1
|
151 |
return seq_pix, seq_starts, tot_pix
|
Martin@1
|
152 |
|
Martin@639
|
153 |
def drawLineForward(hits, width, bp_per_pix, origin, beg1, beg2, size):
|
Martin@639
|
154 |
while True:
|
Martin@639
|
155 |
q1, r1 = divmod(beg1, bp_per_pix)
|
Martin@639
|
156 |
q2, r2 = divmod(beg2, bp_per_pix)
|
Martin@639
|
157 |
hits[origin + q2 * width + q1] |= 1
|
Martin@639
|
158 |
next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
|
Martin@639
|
159 |
if next_pix >= size: break
|
Martin@639
|
160 |
beg1 += next_pix
|
Martin@639
|
161 |
beg2 += next_pix
|
Martin@639
|
162 |
size -= next_pix
|
Martin@639
|
163 |
|
Martin@639
|
164 |
def drawLineReverse(hits, width, bp_per_pix, origin, beg1, beg2, size):
|
Martin@639
|
165 |
beg2 = -1 - beg2
|
Martin@639
|
166 |
while True:
|
Martin@639
|
167 |
q1, r1 = divmod(beg1, bp_per_pix)
|
Martin@639
|
168 |
q2, r2 = divmod(beg2, bp_per_pix)
|
Martin@639
|
169 |
hits[origin + q2 * width + q1] |= 2
|
Martin@639
|
170 |
next_pix = min(bp_per_pix - r1, r2 + 1)
|
Martin@639
|
171 |
if next_pix >= size: break
|
Martin@639
|
172 |
beg1 += next_pix
|
Martin@639
|
173 |
beg2 -= next_pix
|
Martin@639
|
174 |
size -= next_pix
|
Martin@639
|
175 |
|
Martin@640
|
176 |
def alignmentPixels(width, height, alignments, bp_per_pix,
|
Martin@640
|
177 |
seq_start_dic1, seq_start_dic2):
|
Martin@640
|
178 |
hits = [0] * (width * height) # the image data
|
Martin@640
|
179 |
for seq1, seq2, blocks in alignments:
|
Martin@640
|
180 |
seq_start1 = seq_start_dic1[seq1]
|
Martin@640
|
181 |
seq_start2 = seq_start_dic2[seq2]
|
Martin@640
|
182 |
origin = seq_start2 * width + seq_start1
|
Martin@640
|
183 |
for beg1, beg2, size in blocks:
|
Martin@640
|
184 |
if beg1 < 0:
|
Martin@640
|
185 |
beg1 = -(beg1 + size)
|
Martin@640
|
186 |
beg2 = -(beg2 + size)
|
Martin@640
|
187 |
if beg2 >= 0:
|
Martin@640
|
188 |
drawLineForward(hits, width, bp_per_pix, origin,
|
Martin@640
|
189 |
beg1, beg2, size)
|
Martin@640
|
190 |
else:
|
Martin@640
|
191 |
drawLineReverse(hits, width, bp_per_pix, origin,
|
Martin@640
|
192 |
beg1, beg2, size)
|
Martin@640
|
193 |
return hits
|
Martin@1
|
194 |
|
Martin@1
|
195 |
def make_label(text, text_size, range_start, range_size):
|
Martin@1
|
196 |
'''Return an axis label with endpoint & sort-order information.'''
|
Martin@1
|
197 |
text_width = text_size[0]
|
Martin@1
|
198 |
label_start = range_start + (range_size - text_width) // 2
|
Martin@1
|
199 |
label_end = label_start + text_width
|
Martin@1
|
200 |
sort_key = text_width - range_size
|
Martin@1
|
201 |
return sort_key, label_start, label_end, text
|
Martin@1
|
202 |
|
Martin@1
|
203 |
def get_nonoverlapping_labels(labels):
|
Martin@1
|
204 |
'''Get a subset of non-overlapping axis labels, greedily.'''
|
Martin@1
|
205 |
nonoverlapping_labels = []
|
Martin@1
|
206 |
for i in labels:
|
Martin@28
|
207 |
if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
|
Martin@28
|
208 |
for j in nonoverlapping_labels]:
|
Martin@1
|
209 |
nonoverlapping_labels.append(i)
|
Martin@1
|
210 |
return nonoverlapping_labels
|
Martin@1
|
211 |
|
Martin@1
|
212 |
def get_axis_image(seq_names, name_sizes, seq_starts, seq_pix):
|
Martin@1
|
213 |
'''Make an image of axis labels.'''
|
Martin@1
|
214 |
min_pos = seq_starts[0]
|
Martin@1
|
215 |
max_pos = seq_starts[-1] + seq_pix[-1]
|
Martin@28
|
216 |
height = max(zip(*name_sizes)[1])
|
Martin@1
|
217 |
labels = [make_label(i, j, k, l) for i, j, k, l in
|
Martin@1
|
218 |
zip(seq_names, name_sizes, seq_starts, seq_pix)]
|
Martin@1
|
219 |
labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
|
Martin@1
|
220 |
labels.sort()
|
Martin@1
|
221 |
labels = get_nonoverlapping_labels(labels)
|
Martin@1
|
222 |
image_size = max_pos, height
|
Martin@134
|
223 |
im = Image.new(image_mode, image_size, border_shade)
|
Martin@1
|
224 |
draw = ImageDraw.Draw(im)
|
Martin@1
|
225 |
for i in labels:
|
Martin@1
|
226 |
position = i[1], 0
|
Martin@134
|
227 |
draw.text(position, i[3], font=font, fill=text_color)
|
Martin@1
|
228 |
return im
|
Martin@1
|
229 |
|
Martin@643
|
230 |
if __name__ == "__main__":
|
Martin@643
|
231 |
my_name = os.path.basename(sys.argv[0])
|
Martin@643
|
232 |
usage = """
|
Martin@641
|
233 |
%prog --help
|
Martin@641
|
234 |
%prog [options] last-tabular-output dotplot.png
|
Martin@641
|
235 |
%prog [options] last-tabular-output dotplot.gif
|
Martin@641
|
236 |
etc."""
|
Martin@643
|
237 |
op = optparse.OptionParser(usage=usage)
|
Martin@643
|
238 |
# Replace "width" & "height" with a single "length" option?
|
Martin@643
|
239 |
op.add_option("-x", "--width", type="int", default=1000,
|
Martin@643
|
240 |
help="maximum width in pixels (default: %default)")
|
Martin@643
|
241 |
op.add_option("-y", "--height", type="int", default=1000,
|
Martin@643
|
242 |
help="maximum height in pixels (default: %default)")
|
Martin@643
|
243 |
op.add_option("-f", "--fontfile",
|
Martin@643
|
244 |
help="TrueType or OpenType font file")
|
Martin@643
|
245 |
op.add_option("-s", "--fontsize", type="int", default=11,
|
Martin@643
|
246 |
help="TrueType or OpenType font size (default: %default)")
|
Martin@643
|
247 |
op.add_option("-c", "--forwardcolor", default="red",
|
Martin@643
|
248 |
help="Color for forward alignments (default: %default)")
|
Martin@643
|
249 |
op.add_option("-r", "--reversecolor", default="blue",
|
Martin@643
|
250 |
help="Color for reverse alignments (default: %default)")
|
Martin@643
|
251 |
(opts, args) = op.parse_args()
|
Martin@643
|
252 |
if len(args) != 2: op.error("2 arguments needed")
|
Martin@641
|
253 |
|
Martin@643
|
254 |
if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize)
|
Martin@643
|
255 |
else: font = ImageFont.load_default()
|
Martin@641
|
256 |
|
Martin@643
|
257 |
# Make these options too?
|
Martin@643
|
258 |
text_color = "black"
|
Martin@643
|
259 |
background_color = "white"
|
Martin@643
|
260 |
pix_tween_seqs = 2 # number of border pixels between sequences
|
Martin@643
|
261 |
border_shade = 239, 239, 239 # the shade of grey for border pixels
|
Martin@643
|
262 |
label_space = 5 # minimum number of pixels between axis labels
|
Martin@641
|
263 |
|
Martin@643
|
264 |
image_mode = 'RGB'
|
Martin@643
|
265 |
forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
|
Martin@643
|
266 |
reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
|
Martin@643
|
267 |
zipped_colors = zip(forward_color, reverse_color)
|
Martin@643
|
268 |
overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
|
Martin@641
|
269 |
|
Martin@644
|
270 |
warn("reading alignments...")
|
Martin@643
|
271 |
input = fileinput.input(args[0])
|
Martin@643
|
272 |
alignments, seq_size_dic1, seq_size_dic2 = readAlignments(input)
|
Martin@644
|
273 |
warn("done")
|
Martin@641
|
274 |
|
Martin@643
|
275 |
if not alignments:
|
Martin@643
|
276 |
sys.exit(my_name + ": there are no alignments")
|
Martin@641
|
277 |
|
Martin@643
|
278 |
seq_names1, seq_sizes1, name_sizes1, margin1 = get_seq_info(seq_size_dic1)
|
Martin@643
|
279 |
seq_names2, seq_sizes2, name_sizes2, margin2 = get_seq_info(seq_size_dic2)
|
Martin@641
|
280 |
|
Martin@644
|
281 |
warn("choosing bp per pixel...")
|
Martin@643
|
282 |
bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.width - margin1)
|
Martin@643
|
283 |
bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.height - margin2)
|
Martin@643
|
284 |
bp_per_pix = max(bp_per_pix1, bp_per_pix2)
|
Martin@644
|
285 |
warn("bp per pixel = " + str(bp_per_pix))
|
Martin@641
|
286 |
|
Martin@643
|
287 |
seq_pix1, seq_starts1, width = get_pix_info(seq_sizes1, margin1)
|
Martin@643
|
288 |
seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, margin2)
|
Martin@643
|
289 |
seq_start_dic1 = dict(zip(seq_names1, seq_starts1))
|
Martin@643
|
290 |
seq_start_dic2 = dict(zip(seq_names2, seq_starts2))
|
Martin@641
|
291 |
|
Martin@644
|
292 |
warn("processing alignments...")
|
Martin@643
|
293 |
hits = alignmentPixels(width, height, alignments, bp_per_pix,
|
Martin@643
|
294 |
seq_start_dic1, seq_start_dic2)
|
Martin@644
|
295 |
warn("done")
|
Martin@641
|
296 |
|
Martin@643
|
297 |
image_size = width, height
|
Martin@643
|
298 |
im = Image.new(image_mode, image_size, background_color)
|
Martin@134
|
299 |
|
Martin@643
|
300 |
for i in range(height):
|
Martin@643
|
301 |
for j in range(width):
|
Martin@643
|
302 |
store_value = hits[i * width + j]
|
Martin@643
|
303 |
xy = j, i
|
Martin@643
|
304 |
if store_value == 1: im.putpixel(xy, forward_color)
|
Martin@643
|
305 |
elif store_value == 2: im.putpixel(xy, reverse_color)
|
Martin@643
|
306 |
elif store_value == 3: im.putpixel(xy, overlap_color)
|
Martin@95
|
307 |
|
Martin@643
|
308 |
if opts.fontsize != 0:
|
Martin@643
|
309 |
axis1 = get_axis_image(seq_names1, name_sizes1, seq_starts1, seq_pix1)
|
Martin@643
|
310 |
axis2 = get_axis_image(seq_names2, name_sizes2, seq_starts2, seq_pix2)
|
Martin@643
|
311 |
axis2 = axis2.rotate(270)
|
Martin@643
|
312 |
im.paste(axis1, (0, 0))
|
Martin@643
|
313 |
im.paste(axis2, (0, 0))
|
Martin@1
|
314 |
|
Martin@643
|
315 |
for i in seq_starts1[1:]:
|
Martin@643
|
316 |
box = i - pix_tween_seqs, margin2, i, height
|
Martin@643
|
317 |
im.paste(border_shade, box)
|
Martin@1
|
318 |
|
Martin@643
|
319 |
for i in seq_starts2[1:]:
|
Martin@643
|
320 |
box = margin1, i - pix_tween_seqs, width, i
|
Martin@643
|
321 |
im.paste(border_shade, box)
|
Martin@1
|
322 |
|
Martin@643
|
323 |
im.save(args[1])
|