Martin@1
|
1 |
#! /usr/bin/env python
|
Martin@1
|
2 |
|
Martin@272
|
3 |
# Read pair-wise alignments in MAF or LAST tabular format: write an
|
Martin@272
|
4 |
# "Oxford grid", a.k.a. dotplot.
|
Martin@1
|
5 |
|
Martin@1
|
6 |
# TODO: Currently, pixels with zero aligned nt-pairs are white, and
|
Martin@1
|
7 |
# pixels with one or more aligned nt-pairs are black. This can look
|
Martin@1
|
8 |
# too crowded for large genome alignments. I tried shading each pixel
|
Martin@1
|
9 |
# according to the number of aligned nt-pairs within it, but the
|
Martin@1
|
10 |
# result is too faint. How can this be done better?
|
Martin@1
|
11 |
|
Martin@844
|
12 |
import fnmatch, itertools, optparse, os, re, sys
|
Martin@475
|
13 |
|
Martin@475
|
14 |
# Try to make PIL/PILLOW work:
|
Martin@475
|
15 |
try: from PIL import Image, ImageDraw, ImageFont, ImageColor
|
Martin@475
|
16 |
except ImportError: import Image, ImageDraw, ImageFont, ImageColor
|
Martin@1
|
17 |
|
Martin@844
|
18 |
def myOpen(fileName): # faster than fileinput
|
Martin@844
|
19 |
if fileName == "-":
|
Martin@844
|
20 |
return sys.stdin
|
Martin@844
|
21 |
return open(fileName)
|
Martin@844
|
22 |
|
Martin@644
|
23 |
def warn(message):
|
Martin@644
|
24 |
prog = os.path.basename(sys.argv[0])
|
Martin@644
|
25 |
sys.stderr.write(prog + ": " + message + "\n")
|
Martin@644
|
26 |
|
Martin@840
|
27 |
def croppedBlocks(blocks, range1, range2):
|
Martin@840
|
28 |
cropBeg1, cropEnd1 = range1
|
Martin@840
|
29 |
cropBeg2, cropEnd2 = range2
|
Martin@840
|
30 |
if blocks[0][0] < 0: cropBeg1, cropEnd1 = -cropEnd1, -cropBeg1
|
Martin@840
|
31 |
if blocks[0][1] < 0: cropBeg2, cropEnd2 = -cropEnd2, -cropBeg2
|
Martin@840
|
32 |
for beg1, beg2, size in blocks:
|
Martin@840
|
33 |
b1 = max(cropBeg1, beg1)
|
Martin@840
|
34 |
e1 = min(cropEnd1, beg1 + size)
|
Martin@840
|
35 |
if b1 >= e1: continue
|
Martin@840
|
36 |
offset = beg2 - beg1
|
Martin@840
|
37 |
b2 = max(cropBeg2, b1 + offset)
|
Martin@840
|
38 |
e2 = min(cropEnd2, e1 + offset)
|
Martin@840
|
39 |
if b2 >= e2: continue
|
Martin@840
|
40 |
yield b2 - offset, b2, e2 - b2
|
Martin@840
|
41 |
|
Martin@482
|
42 |
def tabBlocks(beg1, beg2, blocks):
|
Martin@482
|
43 |
'''Get the gapless blocks of an alignment, from LAST tabular format.'''
|
Martin@482
|
44 |
for i in blocks.split(","):
|
Martin@482
|
45 |
if ":" in i:
|
Martin@482
|
46 |
x, y = i.split(":")
|
Martin@482
|
47 |
beg1 += int(x)
|
Martin@482
|
48 |
beg2 += int(y)
|
Martin@482
|
49 |
else:
|
Martin@482
|
50 |
size = int(i)
|
Martin@482
|
51 |
yield beg1, beg2, size
|
Martin@482
|
52 |
beg1 += size
|
Martin@482
|
53 |
beg2 += size
|
Martin@272
|
54 |
|
Martin@482
|
55 |
def mafBlocks(beg1, beg2, seq1, seq2):
|
Martin@482
|
56 |
'''Get the gapless blocks of an alignment, from MAF format.'''
|
Martin@482
|
57 |
size = 0
|
Martin@482
|
58 |
for x, y in itertools.izip(seq1, seq2):
|
Martin@482
|
59 |
if x == "-":
|
Martin@482
|
60 |
if size:
|
Martin@482
|
61 |
yield beg1, beg2, size
|
Martin@482
|
62 |
beg1 += size
|
Martin@482
|
63 |
beg2 += size
|
Martin@482
|
64 |
size = 0
|
Martin@482
|
65 |
beg2 += 1
|
Martin@482
|
66 |
elif y == "-":
|
Martin@482
|
67 |
if size:
|
Martin@482
|
68 |
yield beg1, beg2, size
|
Martin@482
|
69 |
beg1 += size
|
Martin@482
|
70 |
beg2 += size
|
Martin@482
|
71 |
size = 0
|
Martin@482
|
72 |
beg1 += 1
|
Martin@272
|
73 |
else:
|
Martin@482
|
74 |
size += 1
|
Martin@482
|
75 |
if size: yield beg1, beg2, size
|
Martin@272
|
76 |
|
Martin@482
|
77 |
def alignmentInput(lines):
|
Martin@482
|
78 |
'''Get alignments and sequence lengths, from MAF or tabular format.'''
|
Martin@482
|
79 |
mafCount = 0
|
Martin@272
|
80 |
for line in lines:
|
Martin@272
|
81 |
w = line.split()
|
Martin@272
|
82 |
if line[0].isdigit(): # tabular format
|
Martin@482
|
83 |
chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
|
Martin@482
|
84 |
if w[4] == "-": beg1 -= seqlen1
|
Martin@482
|
85 |
chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
|
Martin@482
|
86 |
if w[9] == "-": beg2 -= seqlen2
|
Martin@847
|
87 |
blocks = tabBlocks(beg1, beg2, w[11])
|
Martin@482
|
88 |
yield chr1, seqlen1, chr2, seqlen2, blocks
|
Martin@272
|
89 |
elif line[0] == "s": # MAF format
|
Martin@482
|
90 |
if mafCount == 0:
|
Martin@482
|
91 |
chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
|
Martin@482
|
92 |
if w[4] == "-": beg1 -= seqlen1
|
Martin@482
|
93 |
mafCount = 1
|
Martin@482
|
94 |
else:
|
Martin@482
|
95 |
chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
|
Martin@482
|
96 |
if w[4] == "-": beg2 -= seqlen2
|
Martin@847
|
97 |
blocks = mafBlocks(beg1, beg2, seq1, seq2)
|
Martin@482
|
98 |
yield chr1, seqlen1, chr2, seqlen2, blocks
|
Martin@482
|
99 |
mafCount = 0
|
Martin@272
|
100 |
|
Martin@840
|
101 |
def seqRangeFromText(text):
|
Martin@840
|
102 |
if ":" in text:
|
Martin@840
|
103 |
pattern, interval = text.rsplit(":", 1)
|
Martin@840
|
104 |
if "-" in interval:
|
Martin@840
|
105 |
beg, end = interval.rsplit("-", 1)
|
Martin@840
|
106 |
return pattern, int(beg), int(end) # beg may be negative
|
Martin@840
|
107 |
return text, 0, sys.maxsize
|
Martin@840
|
108 |
|
Martin@840
|
109 |
def rangeFromSeqName(seqRanges, name, seqLen):
|
Martin@840
|
110 |
if not seqRanges: return 0, seqLen
|
Martin@651
|
111 |
base = name.split(".")[-1] # allow for names like hg19.chr7
|
Martin@840
|
112 |
for pat, beg, end in seqRanges:
|
Martin@840
|
113 |
if fnmatch.fnmatchcase(name, pat) or fnmatch.fnmatchcase(base, pat):
|
Martin@840
|
114 |
return max(beg, 0), min(end, seqLen)
|
Martin@844
|
115 |
return None
|
Martin@651
|
116 |
|
Martin@851
|
117 |
def updateSeqs(isTrim, seqNames, seqLimits, seqName, seqRange, blocks, index):
|
Martin@851
|
118 |
if seqName not in seqLimits:
|
Martin@851
|
119 |
seqNames.append(seqName)
|
Martin@839
|
120 |
if isTrim:
|
Martin@839
|
121 |
beg = blocks[0][index]
|
Martin@839
|
122 |
end = blocks[-1][index] + blocks[-1][2]
|
Martin@839
|
123 |
if beg < 0: beg, end = -end, -beg
|
Martin@839
|
124 |
if seqName in seqLimits:
|
Martin@839
|
125 |
b, e = seqLimits[seqName]
|
Martin@839
|
126 |
seqLimits[seqName] = min(b, beg), max(e, end)
|
Martin@839
|
127 |
else:
|
Martin@839
|
128 |
seqLimits[seqName] = beg, end
|
Martin@839
|
129 |
else:
|
Martin@840
|
130 |
seqLimits[seqName] = seqRange
|
Martin@839
|
131 |
|
Martin@651
|
132 |
def readAlignments(fileName, opts):
|
Martin@839
|
133 |
'''Get alignments and sequence limits, from MAF or tabular format.'''
|
Martin@840
|
134 |
seqRanges1 = map(seqRangeFromText, opts.seq1)
|
Martin@840
|
135 |
seqRanges2 = map(seqRangeFromText, opts.seq2)
|
Martin@840
|
136 |
|
Martin@482
|
137 |
alignments = []
|
Martin@851
|
138 |
seqNames1 = []
|
Martin@851
|
139 |
seqNames2 = []
|
Martin@839
|
140 |
seqLimits1 = {}
|
Martin@839
|
141 |
seqLimits2 = {}
|
Martin@844
|
142 |
lines = myOpen(fileName)
|
Martin@838
|
143 |
for seqName1, seqLen1, seqName2, seqLen2, blocks in alignmentInput(lines):
|
Martin@840
|
144 |
range1 = rangeFromSeqName(seqRanges1, seqName1, seqLen1)
|
Martin@844
|
145 |
if not range1: continue
|
Martin@840
|
146 |
range2 = rangeFromSeqName(seqRanges2, seqName2, seqLen2)
|
Martin@844
|
147 |
if not range2: continue
|
Martin@847
|
148 |
b = list(croppedBlocks(list(blocks), range1, range2))
|
Martin@840
|
149 |
if not b: continue
|
Martin@840
|
150 |
aln = seqName1, seqName2, b
|
Martin@482
|
151 |
alignments.append(aln)
|
Martin@851
|
152 |
updateSeqs(opts.trim1, seqNames1, seqLimits1, seqName1, range1, b, 0)
|
Martin@851
|
153 |
updateSeqs(opts.trim2, seqNames2, seqLimits2, seqName2, range2, b, 1)
|
Martin@851
|
154 |
return alignments, seqNames1, seqNames2, seqLimits1, seqLimits2
|
Martin@1
|
155 |
|
Martin@1
|
156 |
def natural_sort_key(my_string):
|
Martin@1
|
157 |
'''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
|
Martin@1
|
158 |
parts = re.split(r'(\d+)', my_string)
|
Martin@1
|
159 |
parts[1::2] = map(int, parts[1::2])
|
Martin@1
|
160 |
return parts
|
Martin@1
|
161 |
|
Martin@647
|
162 |
def get_text_sizes(my_strings, font, fontsize, image_mode):
|
Martin@1
|
163 |
'''Get widths & heights, in pixels, of some strings.'''
|
Martin@647
|
164 |
if fontsize == 0: return [(0, 0) for i in my_strings]
|
Martin@1
|
165 |
image_size = 1, 1
|
Martin@134
|
166 |
im = Image.new(image_mode, image_size)
|
Martin@1
|
167 |
draw = ImageDraw.Draw(im)
|
Martin@1
|
168 |
return [draw.textsize(i, font=font) for i in my_strings]
|
Martin@1
|
169 |
|
Martin@846
|
170 |
def sizeText(size):
|
Martin@846
|
171 |
suffixes = "bp", "kb", "Mb", "Gb"
|
Martin@846
|
172 |
for i, x in enumerate(suffixes):
|
Martin@846
|
173 |
j = 10 ** (i * 3)
|
Martin@846
|
174 |
if size < j * 10:
|
Martin@846
|
175 |
return "%.2g" % (1.0 * size / j) + x
|
Martin@846
|
176 |
if size < j * 1000 or i == len(suffixes) - 1:
|
Martin@846
|
177 |
return "%.0f" % (1.0 * size / j) + x
|
Martin@846
|
178 |
|
Martin@846
|
179 |
def seqNameAndSizeText(seqName, seqSize):
|
Martin@846
|
180 |
return seqName + ": " + sizeText(seqSize)
|
Martin@846
|
181 |
|
Martin@851
|
182 |
def getSeqInfo(sortOpt, seqNames, seqLimits,
|
Martin@851
|
183 |
font, fontsize, image_mode, isShowSize):
|
Martin@1
|
184 |
'''Return miscellaneous information about the sequences.'''
|
Martin@851
|
185 |
if sortOpt == 1:
|
Martin@851
|
186 |
seqNames.sort(key=natural_sort_key)
|
Martin@850
|
187 |
seqSizes = [seqLimits[i][1] - seqLimits[i][0] for i in seqNames]
|
Martin@851
|
188 |
if sortOpt == 2:
|
Martin@851
|
189 |
seqRecords = sorted(zip(seqSizes, seqNames), reverse=True)
|
Martin@851
|
190 |
seqSizes = [i[0] for i in seqRecords]
|
Martin@851
|
191 |
seqNames = [i[1] for i in seqRecords]
|
Martin@850
|
192 |
if isShowSize:
|
Martin@850
|
193 |
seqLabels = map(seqNameAndSizeText, seqNames, seqSizes)
|
Martin@850
|
194 |
else:
|
Martin@850
|
195 |
seqLabels = seqNames
|
Martin@846
|
196 |
labelSizes = get_text_sizes(seqLabels, font, fontsize, image_mode)
|
Martin@846
|
197 |
margin = max(zip(*labelSizes)[1]) # maximum text height
|
Martin@850
|
198 |
return seqNames, seqSizes, seqLabels, labelSizes, margin
|
Martin@1
|
199 |
|
Martin@1
|
200 |
def div_ceil(x, y):
|
Martin@1
|
201 |
'''Return x / y rounded up.'''
|
Martin@1
|
202 |
q, r = divmod(x, y)
|
Martin@1
|
203 |
return q + (r != 0)
|
Martin@1
|
204 |
|
Martin@1
|
205 |
def tot_seq_pix(seq_sizes, bp_per_pix):
|
Martin@1
|
206 |
'''Return the total pixels needed for sequences of the given sizes.'''
|
Martin@28
|
207 |
return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
|
Martin@1
|
208 |
|
Martin@645
|
209 |
def get_bp_per_pix(seq_sizes, pix_tween_seqs, pix_limit):
|
Martin@1
|
210 |
'''Get the minimum bp-per-pixel that fits in the size limit.'''
|
Martin@1
|
211 |
seq_num = len(seq_sizes)
|
Martin@1
|
212 |
seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
|
Martin@1
|
213 |
if seq_pix_limit < seq_num:
|
Martin@649
|
214 |
raise Exception("can't fit the image: too many sequences?")
|
Martin@51
|
215 |
lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
|
Martin@1
|
216 |
for bp_per_pix in itertools.count(lower_bound): # slow linear search
|
Martin@1
|
217 |
if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
|
Martin@1
|
218 |
return bp_per_pix
|
Martin@1
|
219 |
|
Martin@1
|
220 |
def get_seq_starts(seq_pix, pix_tween_seqs, margin):
|
Martin@1
|
221 |
'''Get the start pixel for each sequence.'''
|
Martin@1
|
222 |
seq_starts = []
|
Martin@1
|
223 |
pix_tot = margin - pix_tween_seqs
|
Martin@1
|
224 |
for i in seq_pix:
|
Martin@1
|
225 |
pix_tot += pix_tween_seqs
|
Martin@1
|
226 |
seq_starts.append(pix_tot)
|
Martin@1
|
227 |
pix_tot += i
|
Martin@1
|
228 |
return seq_starts
|
Martin@1
|
229 |
|
Martin@645
|
230 |
def get_pix_info(seq_sizes, bp_per_pix, pix_tween_seqs, margin):
|
Martin@1
|
231 |
'''Return pixel information about the sequences.'''
|
Martin@1
|
232 |
seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
|
Martin@1
|
233 |
seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
|
Martin@1
|
234 |
tot_pix = seq_starts[-1] + seq_pix[-1]
|
Martin@1
|
235 |
return seq_pix, seq_starts, tot_pix
|
Martin@1
|
236 |
|
Martin@835
|
237 |
def drawLineForward(hits, width, bp_per_pix, beg1, beg2, size):
|
Martin@639
|
238 |
while True:
|
Martin@639
|
239 |
q1, r1 = divmod(beg1, bp_per_pix)
|
Martin@639
|
240 |
q2, r2 = divmod(beg2, bp_per_pix)
|
Martin@835
|
241 |
hits[q2 * width + q1] |= 1
|
Martin@639
|
242 |
next_pix = min(bp_per_pix - r1, bp_per_pix - r2)
|
Martin@639
|
243 |
if next_pix >= size: break
|
Martin@639
|
244 |
beg1 += next_pix
|
Martin@639
|
245 |
beg2 += next_pix
|
Martin@639
|
246 |
size -= next_pix
|
Martin@639
|
247 |
|
Martin@835
|
248 |
def drawLineReverse(hits, width, bp_per_pix, beg1, beg2, size):
|
Martin@639
|
249 |
beg2 = -1 - beg2
|
Martin@639
|
250 |
while True:
|
Martin@639
|
251 |
q1, r1 = divmod(beg1, bp_per_pix)
|
Martin@639
|
252 |
q2, r2 = divmod(beg2, bp_per_pix)
|
Martin@835
|
253 |
hits[q2 * width + q1] |= 2
|
Martin@639
|
254 |
next_pix = min(bp_per_pix - r1, r2 + 1)
|
Martin@639
|
255 |
if next_pix >= size: break
|
Martin@639
|
256 |
beg1 += next_pix
|
Martin@639
|
257 |
beg2 -= next_pix
|
Martin@639
|
258 |
size -= next_pix
|
Martin@639
|
259 |
|
Martin@836
|
260 |
def alignmentPixels(width, height, alignments, bp_per_pix, origins1, origins2):
|
Martin@640
|
261 |
hits = [0] * (width * height) # the image data
|
Martin@640
|
262 |
for seq1, seq2, blocks in alignments:
|
Martin@836
|
263 |
ori1 = origins1[seq1]
|
Martin@836
|
264 |
ori2 = origins2[seq2]
|
Martin@640
|
265 |
for beg1, beg2, size in blocks:
|
Martin@640
|
266 |
if beg1 < 0:
|
Martin@640
|
267 |
beg1 = -(beg1 + size)
|
Martin@640
|
268 |
beg2 = -(beg2 + size)
|
Martin@640
|
269 |
if beg2 >= 0:
|
Martin@835
|
270 |
drawLineForward(hits, width, bp_per_pix,
|
Martin@835
|
271 |
beg1 + ori1, beg2 + ori2, size)
|
Martin@640
|
272 |
else:
|
Martin@835
|
273 |
drawLineReverse(hits, width, bp_per_pix,
|
Martin@835
|
274 |
beg1 + ori1, beg2 - ori2, size)
|
Martin@640
|
275 |
return hits
|
Martin@1
|
276 |
|
Martin@650
|
277 |
def expandedSeqDict(seqDict):
|
Martin@650
|
278 |
'''Allow lookup by short sequence names, e.g. chr7 as well as hg19.chr7.'''
|
Martin@650
|
279 |
newDict = {}
|
Martin@650
|
280 |
for name, x in seqDict.items():
|
Martin@650
|
281 |
base = name.split(".")[-1]
|
Martin@650
|
282 |
newDict[name] = x
|
Martin@650
|
283 |
newDict[base] = x
|
Martin@650
|
284 |
return newDict
|
Martin@650
|
285 |
|
Martin@845
|
286 |
def readBed(fileName, seqLimits):
|
Martin@845
|
287 |
if not fileName: return
|
Martin@845
|
288 |
for line in myOpen(fileName):
|
Martin@845
|
289 |
w = line.split()
|
Martin@847
|
290 |
if not w: continue
|
Martin@845
|
291 |
seqName = w[0]
|
Martin@845
|
292 |
if seqName not in seqLimits: continue
|
Martin@845
|
293 |
cropBeg, cropEnd = seqLimits[seqName]
|
Martin@845
|
294 |
beg = int(w[1])
|
Martin@845
|
295 |
end = int(w[2])
|
Martin@845
|
296 |
b = max(beg, cropBeg)
|
Martin@845
|
297 |
e = min(end, cropEnd)
|
Martin@845
|
298 |
if b >= e: continue
|
Martin@856
|
299 |
color = "#ffe4ff"
|
Martin@856
|
300 |
if len(w) > 5:
|
Martin@856
|
301 |
if len(w) > 8 and w[8].count(",") == 2:
|
Martin@856
|
302 |
color = "rgb(" + w[8] + ")"
|
Martin@856
|
303 |
elif w[5] == "+":
|
Martin@856
|
304 |
color = "#fff4f4"
|
Martin@856
|
305 |
elif w[5] == "-":
|
Martin@856
|
306 |
color = "#f4f4ff"
|
Martin@845
|
307 |
yield seqName, b, e, color
|
Martin@845
|
308 |
|
Martin@650
|
309 |
def isExtraFirstGapField(fields):
|
Martin@650
|
310 |
return fields[4].isdigit()
|
Martin@650
|
311 |
|
Martin@839
|
312 |
def readGaps(fileName, seqLimits):
|
Martin@650
|
313 |
'''Read locations of unsequenced gaps, from an agp or gap file.'''
|
Martin@650
|
314 |
if not fileName: return
|
Martin@844
|
315 |
for line in myOpen(fileName):
|
Martin@650
|
316 |
w = line.split()
|
Martin@650
|
317 |
if not w or w[0][0] == "#": continue
|
Martin@650
|
318 |
if isExtraFirstGapField(w): w = w[1:]
|
Martin@650
|
319 |
if w[4] not in "NU": continue
|
Martin@650
|
320 |
seqName = w[0]
|
Martin@839
|
321 |
if seqName not in seqLimits: continue
|
Martin@839
|
322 |
cropBeg, cropEnd = seqLimits[seqName]
|
Martin@650
|
323 |
end = int(w[2])
|
Martin@650
|
324 |
beg = end - int(w[5]) # zero-based coordinate
|
Martin@839
|
325 |
b = max(beg, cropBeg)
|
Martin@839
|
326 |
e = min(end, cropEnd)
|
Martin@839
|
327 |
if b >= e: continue
|
Martin@650
|
328 |
bridgedText = w[7]
|
Martin@839
|
329 |
yield seqName, b, e, bridgedText
|
Martin@650
|
330 |
|
Martin@845
|
331 |
def drawAnnotations(im, beds, origins, margin, limit, isTop, bp_per_pix):
|
Martin@845
|
332 |
# XXX no consideration of different-color overlaps
|
Martin@845
|
333 |
for seqName, beg, end, color in beds:
|
Martin@845
|
334 |
ori = origins[seqName]
|
Martin@845
|
335 |
b = (ori + beg) // bp_per_pix
|
Martin@845
|
336 |
e = div_ceil(ori + end, bp_per_pix)
|
Martin@845
|
337 |
if isTop: box = b, margin, e, limit
|
Martin@845
|
338 |
else: box = margin, b, limit, e
|
Martin@845
|
339 |
im.paste(color, box)
|
Martin@845
|
340 |
|
Martin@836
|
341 |
def drawUnsequencedGaps(im, gaps, origins, margin, limit, isTop, bridgedText,
|
Martin@650
|
342 |
bp_per_pix, color):
|
Martin@650
|
343 |
'''Draw rectangles representing unsequenced gaps.'''
|
Martin@650
|
344 |
for seqName, beg, end, b in gaps:
|
Martin@650
|
345 |
if b != bridgedText: continue
|
Martin@836
|
346 |
ori = origins[seqName]
|
Martin@836
|
347 |
b = div_ceil(ori + beg, bp_per_pix) # use fully-covered pixels only
|
Martin@836
|
348 |
e = (ori + end) // bp_per_pix
|
Martin@650
|
349 |
if e <= b: continue
|
Martin@836
|
350 |
if isTop: box = b, margin, e, limit
|
Martin@836
|
351 |
else: box = margin, b, limit, e
|
Martin@650
|
352 |
im.paste(color, box)
|
Martin@650
|
353 |
|
Martin@1
|
354 |
def make_label(text, text_size, range_start, range_size):
|
Martin@1
|
355 |
'''Return an axis label with endpoint & sort-order information.'''
|
Martin@1
|
356 |
text_width = text_size[0]
|
Martin@1
|
357 |
label_start = range_start + (range_size - text_width) // 2
|
Martin@1
|
358 |
label_end = label_start + text_width
|
Martin@1
|
359 |
sort_key = text_width - range_size
|
Martin@1
|
360 |
return sort_key, label_start, label_end, text
|
Martin@1
|
361 |
|
Martin@645
|
362 |
def get_nonoverlapping_labels(labels, label_space):
|
Martin@1
|
363 |
'''Get a subset of non-overlapping axis labels, greedily.'''
|
Martin@1
|
364 |
nonoverlapping_labels = []
|
Martin@1
|
365 |
for i in labels:
|
Martin@28
|
366 |
if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
|
Martin@28
|
367 |
for j in nonoverlapping_labels]:
|
Martin@1
|
368 |
nonoverlapping_labels.append(i)
|
Martin@1
|
369 |
return nonoverlapping_labels
|
Martin@1
|
370 |
|
Martin@837
|
371 |
def get_axis_image(seqNames, name_sizes, seq_starts, seq_pix,
|
Martin@647
|
372 |
font, image_mode, opts):
|
Martin@1
|
373 |
'''Make an image of axis labels.'''
|
Martin@1
|
374 |
min_pos = seq_starts[0]
|
Martin@1
|
375 |
max_pos = seq_starts[-1] + seq_pix[-1]
|
Martin@28
|
376 |
height = max(zip(*name_sizes)[1])
|
Martin@847
|
377 |
labels = map(make_label, seqNames, name_sizes, seq_starts, seq_pix)
|
Martin@1
|
378 |
labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
|
Martin@1
|
379 |
labels.sort()
|
Martin@646
|
380 |
labels = get_nonoverlapping_labels(labels, opts.label_space)
|
Martin@1
|
381 |
image_size = max_pos, height
|
Martin@852
|
382 |
im = Image.new(image_mode, image_size, opts.border_color)
|
Martin@1
|
383 |
draw = ImageDraw.Draw(im)
|
Martin@1
|
384 |
for i in labels:
|
Martin@1
|
385 |
position = i[1], 0
|
Martin@646
|
386 |
draw.text(position, i[3], font=font, fill=opts.text_color)
|
Martin@1
|
387 |
return im
|
Martin@1
|
388 |
|
Martin@839
|
389 |
def seqOrigins(seqNames, seq_starts, seqLimits, bp_per_pix):
|
Martin@837
|
390 |
for i, j in zip(seqNames, seq_starts):
|
Martin@839
|
391 |
yield i, bp_per_pix * j - seqLimits[i][0]
|
Martin@836
|
392 |
|
Martin@648
|
393 |
def lastDotplot(opts, args):
|
Martin@643
|
394 |
if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize)
|
Martin@643
|
395 |
else: font = ImageFont.load_default()
|
Martin@641
|
396 |
|
Martin@643
|
397 |
image_mode = 'RGB'
|
Martin@643
|
398 |
forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
|
Martin@643
|
399 |
reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
|
Martin@643
|
400 |
zipped_colors = zip(forward_color, reverse_color)
|
Martin@643
|
401 |
overlap_color = tuple([(i + j) // 2 for i, j in zipped_colors])
|
Martin@641
|
402 |
|
Martin@644
|
403 |
warn("reading alignments...")
|
Martin@851
|
404 |
alignmentInfo = readAlignments(args[0], opts)
|
Martin@851
|
405 |
alignments, seqNames1, seqNames2, seqLimits1, seqLimits2 = alignmentInfo
|
Martin@644
|
406 |
warn("done")
|
Martin@649
|
407 |
if not alignments: raise Exception("there are no alignments")
|
Martin@641
|
408 |
|
Martin@851
|
409 |
i1 = getSeqInfo(opts.sort1, seqNames1, seqLimits1,
|
Martin@851
|
410 |
font, opts.fontsize, image_mode, opts.lengths1)
|
Martin@850
|
411 |
seqNames1, seqSizes1, seqLabels1, labelSizes1, margin1 = i1
|
Martin@846
|
412 |
|
Martin@851
|
413 |
i2 = getSeqInfo(opts.sort2, seqNames2, seqLimits2,
|
Martin@851
|
414 |
font, opts.fontsize, image_mode, opts.lengths2)
|
Martin@850
|
415 |
seqNames2, seqSizes2, seqLabels2, labelSizes2, margin2 = i2
|
Martin@641
|
416 |
|
Martin@644
|
417 |
warn("choosing bp per pixel...")
|
Martin@645
|
418 |
pix_limit1 = opts.width - margin1
|
Martin@645
|
419 |
pix_limit2 = opts.height - margin2
|
Martin@855
|
420 |
bpPerPix1 = get_bp_per_pix(seqSizes1, opts.border_pixels, pix_limit1)
|
Martin@855
|
421 |
bpPerPix2 = get_bp_per_pix(seqSizes2, opts.border_pixels, pix_limit2)
|
Martin@855
|
422 |
bpPerPix = max(bpPerPix1, bpPerPix2)
|
Martin@855
|
423 |
warn("bp per pixel = " + str(bpPerPix))
|
Martin@641
|
424 |
|
Martin@855
|
425 |
seq_pix1, seq_starts1, width = get_pix_info(seqSizes1, bpPerPix,
|
Martin@852
|
426 |
opts.border_pixels, margin1)
|
Martin@855
|
427 |
seq_pix2, seq_starts2, height = get_pix_info(seqSizes2, bpPerPix,
|
Martin@852
|
428 |
opts.border_pixels, margin2)
|
Martin@847
|
429 |
warn("width: " + str(width))
|
Martin@847
|
430 |
warn("height: " + str(height))
|
Martin@839
|
431 |
|
Martin@855
|
432 |
origins1 = dict(seqOrigins(seqNames1, seq_starts1, seqLimits1, bpPerPix))
|
Martin@855
|
433 |
origins2 = dict(seqOrigins(seqNames2, seq_starts2, seqLimits2, bpPerPix))
|
Martin@641
|
434 |
|
Martin@644
|
435 |
warn("processing alignments...")
|
Martin@855
|
436 |
hits = alignmentPixels(width, height, alignments, bpPerPix,
|
Martin@836
|
437 |
origins1, origins2)
|
Martin@644
|
438 |
warn("done")
|
Martin@641
|
439 |
|
Martin@643
|
440 |
image_size = width, height
|
Martin@646
|
441 |
im = Image.new(image_mode, image_size, opts.background_color)
|
Martin@134
|
442 |
|
Martin@845
|
443 |
seqLimits1 = expandedSeqDict(seqLimits1)
|
Martin@845
|
444 |
seqLimits2 = expandedSeqDict(seqLimits2)
|
Martin@836
|
445 |
origins1 = expandedSeqDict(origins1)
|
Martin@836
|
446 |
origins2 = expandedSeqDict(origins2)
|
Martin@845
|
447 |
|
Martin@845
|
448 |
beds1 = list(readBed(opts.bed1, seqLimits1))
|
Martin@845
|
449 |
beds2 = list(readBed(opts.bed2, seqLimits2))
|
Martin@855
|
450 |
drawAnnotations(im, beds1, origins1, margin2, height, True, bpPerPix)
|
Martin@855
|
451 |
drawAnnotations(im, beds2, origins2, margin1, width, False, bpPerPix)
|
Martin@845
|
452 |
|
Martin@839
|
453 |
gaps1 = list(readGaps(opts.gap1, seqLimits1))
|
Martin@839
|
454 |
gaps2 = list(readGaps(opts.gap2, seqLimits2))
|
Martin@650
|
455 |
# draw bridged gaps first, then unbridged gaps on top:
|
Martin@836
|
456 |
drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "yes",
|
Martin@855
|
457 |
bpPerPix, opts.bridged_color)
|
Martin@836
|
458 |
drawUnsequencedGaps(im, gaps2, origins2, margin1, width, False, "yes",
|
Martin@855
|
459 |
bpPerPix, opts.bridged_color)
|
Martin@836
|
460 |
drawUnsequencedGaps(im, gaps1, origins1, margin2, height, True, "no",
|
Martin@855
|
461 |
bpPerPix, opts.unbridged_color)
|
Martin@836
|
462 |
drawUnsequencedGaps(im, gaps2, origins2, margin1, width, False, "no",
|
Martin@855
|
463 |
bpPerPix, opts.unbridged_color)
|
Martin@650
|
464 |
|
Martin@643
|
465 |
for i in range(height):
|
Martin@643
|
466 |
for j in range(width):
|
Martin@643
|
467 |
store_value = hits[i * width + j]
|
Martin@643
|
468 |
xy = j, i
|
Martin@643
|
469 |
if store_value == 1: im.putpixel(xy, forward_color)
|
Martin@643
|
470 |
elif store_value == 2: im.putpixel(xy, reverse_color)
|
Martin@643
|
471 |
elif store_value == 3: im.putpixel(xy, overlap_color)
|
Martin@95
|
472 |
|
Martin@643
|
473 |
if opts.fontsize != 0:
|
Martin@846
|
474 |
axis1 = get_axis_image(seqLabels1, labelSizes1, seq_starts1, seq_pix1,
|
Martin@647
|
475 |
font, image_mode, opts)
|
Martin@846
|
476 |
axis2 = get_axis_image(seqLabels2, labelSizes2, seq_starts2, seq_pix2,
|
Martin@647
|
477 |
font, image_mode, opts)
|
Martin@834
|
478 |
axis2 = axis2.transpose(Image.ROTATE_270) # !!! bug hotspot
|
Martin@643
|
479 |
im.paste(axis1, (0, 0))
|
Martin@643
|
480 |
im.paste(axis2, (0, 0))
|
Martin@1
|
481 |
|
Martin@643
|
482 |
for i in seq_starts1[1:]:
|
Martin@852
|
483 |
box = i - opts.border_pixels, margin2, i, height
|
Martin@852
|
484 |
im.paste(opts.border_color, box)
|
Martin@1
|
485 |
|
Martin@643
|
486 |
for i in seq_starts2[1:]:
|
Martin@852
|
487 |
box = margin1, i - opts.border_pixels, width, i
|
Martin@852
|
488 |
im.paste(opts.border_color, box)
|
Martin@1
|
489 |
|
Martin@643
|
490 |
im.save(args[1])
|
Martin@648
|
491 |
|
Martin@648
|
492 |
if __name__ == "__main__":
|
Martin@649
|
493 |
usage = """%prog --help
|
Martin@649
|
494 |
or: %prog [options] maf-or-tab-alignments dotplot.png
|
Martin@649
|
495 |
or: %prog [options] maf-or-tab-alignments dotplot.gif
|
Martin@649
|
496 |
or: ..."""
|
Martin@649
|
497 |
description = "Draw a dotplot of pair-wise sequence alignments in MAF or tabular format."
|
Martin@649
|
498 |
op = optparse.OptionParser(usage=usage, description=description)
|
Martin@651
|
499 |
op.add_option("-1", "--seq1", metavar="PATTERN", action="append",
|
Martin@840
|
500 |
default=[],
|
Martin@651
|
501 |
help="which sequences to show from the 1st genome")
|
Martin@651
|
502 |
op.add_option("-2", "--seq2", metavar="PATTERN", action="append",
|
Martin@840
|
503 |
default=[],
|
Martin@651
|
504 |
help="which sequences to show from the 2nd genome")
|
Martin@648
|
505 |
# Replace "width" & "height" with a single "length" option?
|
Martin@648
|
506 |
op.add_option("-x", "--width", type="int", default=1000,
|
Martin@648
|
507 |
help="maximum width in pixels (default: %default)")
|
Martin@648
|
508 |
op.add_option("-y", "--height", type="int", default=1000,
|
Martin@648
|
509 |
help="maximum height in pixels (default: %default)")
|
Martin@649
|
510 |
op.add_option("-c", "--forwardcolor", metavar="COLOR", default="red",
|
Martin@649
|
511 |
help="color for forward alignments (default: %default)")
|
Martin@649
|
512 |
op.add_option("-r", "--reversecolor", metavar="COLOR", default="blue",
|
Martin@649
|
513 |
help="color for reverse alignments (default: %default)")
|
Martin@851
|
514 |
op.add_option("--sort1", type="int", default=1, metavar="N",
|
Martin@851
|
515 |
help="genome1 sequence order: 0=input order, 1=name order, "
|
Martin@851
|
516 |
"2=length order (default=%default)")
|
Martin@851
|
517 |
op.add_option("--sort2", type="int", default=1, metavar="N",
|
Martin@851
|
518 |
help="genome2 sequence order: 0=input order, 1=name order, "
|
Martin@851
|
519 |
"2=length order (default=%default)")
|
Martin@839
|
520 |
op.add_option("--trim1", action="store_true",
|
Martin@839
|
521 |
help="trim unaligned sequence flanks from the 1st genome")
|
Martin@839
|
522 |
op.add_option("--trim2", action="store_true",
|
Martin@839
|
523 |
help="trim unaligned sequence flanks from the 2nd genome")
|
Martin@852
|
524 |
op.add_option("--border-pixels", metavar="INT", type="int", default=1,
|
Martin@852
|
525 |
help="number of pixels between sequences (default=%default)")
|
Martin@852
|
526 |
op.add_option("--border-color", metavar="COLOR", default="#dcdcdc",
|
Martin@852
|
527 |
help="color for pixels between sequences (default=%default)")
|
Martin@852
|
528 |
# xxx --margin-color?
|
Martin@845
|
529 |
op.add_option("--bed1", metavar="FILE",
|
Martin@845
|
530 |
help="read genome1 annotations from bed file")
|
Martin@845
|
531 |
op.add_option("--bed2", metavar="FILE",
|
Martin@845
|
532 |
help="read genome2 annotations from bed file")
|
Martin@846
|
533 |
|
Martin@850
|
534 |
og = optparse.OptionGroup(op, "Text options")
|
Martin@850
|
535 |
og.add_option("-f", "--fontfile", metavar="FILE",
|
Martin@850
|
536 |
help="TrueType or OpenType font file")
|
Martin@850
|
537 |
og.add_option("-s", "--fontsize", metavar="SIZE", type="int", default=11,
|
Martin@850
|
538 |
help="TrueType or OpenType font size (default: %default)")
|
Martin@850
|
539 |
og.add_option("--lengths1", action="store_true",
|
Martin@850
|
540 |
help="show sequence lengths for the 1st (horizontal) genome")
|
Martin@850
|
541 |
og.add_option("--lengths2", action="store_true",
|
Martin@850
|
542 |
help="show sequence lengths for the 2nd (vertical) genome")
|
Martin@850
|
543 |
op.add_option_group(og)
|
Martin@850
|
544 |
|
Martin@650
|
545 |
og = optparse.OptionGroup(op, "Unsequenced gap options")
|
Martin@650
|
546 |
og.add_option("--gap1", metavar="FILE",
|
Martin@650
|
547 |
help="read genome1 unsequenced gaps from agp or gap file")
|
Martin@650
|
548 |
og.add_option("--gap2", metavar="FILE",
|
Martin@650
|
549 |
help="read genome2 unsequenced gaps from agp or gap file")
|
Martin@650
|
550 |
og.add_option("--bridged-color", metavar="COLOR", default="yellow",
|
Martin@650
|
551 |
help="color for bridged gaps (default: %default)")
|
Martin@650
|
552 |
og.add_option("--unbridged-color", metavar="COLOR", default="pink",
|
Martin@650
|
553 |
help="color for unbridged gaps (default: %default)")
|
Martin@650
|
554 |
op.add_option_group(og)
|
Martin@648
|
555 |
(opts, args) = op.parse_args()
|
Martin@648
|
556 |
if len(args) != 2: op.error("2 arguments needed")
|
Martin@648
|
557 |
|
Martin@648
|
558 |
opts.text_color = "black"
|
Martin@648
|
559 |
opts.background_color = "white"
|
Martin@648
|
560 |
opts.label_space = 5 # minimum number of pixels between axis labels
|
Martin@648
|
561 |
|
Martin@649
|
562 |
try: lastDotplot(opts, args)
|
Martin@649
|
563 |
except KeyboardInterrupt: pass # avoid silly error message
|
Martin@649
|
564 |
except Exception, e:
|
Martin@649
|
565 |
prog = os.path.basename(sys.argv[0])
|
Martin@649
|
566 |
sys.exit(prog + ": error: " + str(e))
|