Skip to content

Commit cbe5cf1

Browse files
Bug in crispr (#582)
* fix bug with protospacers from a circular construct, more and better docstrings * more tests for crispr * added types to crispr module. Could not use pydna.types because of circular import * new tests and higher coverage * alternative reusing dseqrecord_finditer) (#585) * fix bug and add test --------- Co-authored-by: Manuel Lera-Ramirez <manulera14@gmail.com>
1 parent 8bc4a62 commit cbe5cf1

2 files changed

Lines changed: 463 additions & 106 deletions

File tree

src/pydna/crispr.py

Lines changed: 157 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,126 +1,187 @@
11
#!/usr/bin/env python3
22
# -*- coding: utf-8 -*-
33

4-
# Copyright 2013-2023 by Björn Johansson. All rights reserved.
5-
# This code is part of the Python-dna distribution and governed by its
6-
# license. Please see the LICENSE.txt file that should have been included
7-
# as part of this package.
8-
"""Provides the Dseq class for handling double stranded DNA sequences.
9-
10-
Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
11-
is mostly useful as a part of the :class:`pydna.dseqrecord.Dseqrecord` class
12-
which can hold more meta data.
13-
14-
The Dseq class support the notion of circular and linear DNA topology.
154
"""
5+
Utilities for CRISPR/Cas target searching and protospacer extraction.
166
17-
from abc import ABC, abstractmethod
7+
"""
188
import re
19-
from pydna.utils import rc
9+
from abc import ABC
10+
from abc import abstractmethod
11+
from typing import Type
12+
from typing import TYPE_CHECKING
13+
from typing import List
14+
from typing import TypeVar
15+
16+
if TYPE_CHECKING: # pragma: no cover
17+
from pydna.dseqrecord import Dseqrecord
18+
19+
DseqrecordType = TypeVar("DseqrecordType", bound="Dseqrecord")
2020

2121

2222
class _cas(ABC):
23-
scaffold = "ND"
24-
pam = "ND"
25-
size = 0
26-
fst5 = 0
27-
fst3 = 0
28-
29-
def __init__(self, protospacer):
30-
self.protospacer = protospacer.upper()
31-
self.compsite = re.compile(
32-
f"(?=(?P<watson>{self.protospacer}{self.pam}))|(?=(?P<crick>{rc(self.pam)}{rc(self.protospacer)}))",
33-
re.UNICODE,
34-
)
23+
"""
24+
Abstract base class for CRISPR-associated nucleases.
25+
pam, scaffold and cut location is set by a subclass
26+
such as Cas9 below.
27+
28+
The meaning of size, fst5 and fst3 are the same as for the restriciton
29+
enzymes in the Biopython restriction module (Bio.Restriction).
30+
"""
31+
32+
scaffold: str = "ND"
33+
pam: str = "ND"
34+
size: int = 0
35+
fst5: int = 0
36+
fst3: int = 0
37+
38+
def __init__(self, protospacer: str) -> None:
39+
"""
40+
Initialize the nuclease with a protospacer sequence.
41+
The sequence is a string. Use the protospacer function
42+
to extract a sequence from a Dseqrecord.
43+
44+
Args:
45+
protospacer: Protospacer sequence used to build the search pattern.
46+
"""
47+
from pydna.sequence_regex import compute_regex_site
48+
49+
self.protospacer: str = protospacer.upper()
50+
self.compsite = compute_regex_site(f"{self.protospacer}{self.pam}")
3551

3652
@abstractmethod
37-
def search(self, dna, linear=True):
38-
"""To override in subclass."""
39-
pass
53+
def search(self, dna, linear: bool = True) -> List[int]:
54+
"""Return a list of cutting sites of the enzyme in the sequence.
55+
56+
dna must be an instance of:
57+
58+
- pydna.dseq.Dseq
59+
- Bio.Seq.Seq
60+
- Bio.Seq.MutableSeq
4061
41-
def __repr__(self):
62+
pydna.dseqrecord.Dseqrecord or Bio.SeqRecord.SeqRecord will not work.
63+
This limitation is by design t omirror enzymes in the
64+
Biopython Bio.Restriction class
65+
66+
The linear argument is laso there for compatibility with the
67+
Biopython Bio.Restriction class.
68+
69+
An important caveat is that search ignores the circular property of
70+
pydna.dseq.Dseq.
71+
72+
If linear is False, the restriction sites that span over the boundaries
73+
will be included.
74+
75+
The positions are the first base of the 3' fragment,
76+
i.e. the first base after the position the enzyme will cut.
77+
"""
78+
raise NotImplementedError # pragma: no cover
79+
80+
def __repr__(self) -> str:
81+
"""
82+
Return a compact representation of the Cas9+gRNA nuclease instance.
83+
84+
Returns:
85+
String representation with abbreviated protospacer.
86+
"""
4287
return f"{type(self).__name__}({self.protospacer[:3]}..{self.protospacer[-3:]})"
4388

44-
@abstractmethod
45-
def __str__(self):
46-
"""To override in subclass."""
47-
pass
89+
def __str__(self) -> str:
90+
"""
91+
Return the guide RNA protospacer and scaffold as FASTA-like string.
92+
"""
93+
return f">{type(self).__name__} protospacer scaffold\n{self.protospacer} {self.scaffold}"
4894

4995

5096
class cas9(_cas):
5197
"""docstring.
5298
5399
.. code-block::
54100
55-
|----size----------|
56101
57-
---protospacer------
58-
-fst3
59-
fst5 |-|
60-
|--------------|
61-
PAM
62-
5-NNGGAAGAGTAATACACTA-AAANGGNN-3
63-
||||||||||||||||||| ||||||||
64-
3-NNCCTTCTCATTATGTGAT-TTTNCCNN-5
102+
fst5 --|fst3
103+
|----------------
104+
PAM
105+
5'-NNGGAAGAGTAATACACTA-AAANGGNN-3'
106+
||||||||||||||||||| ||||||||
107+
3'-NNCCTTCTCATTATGTGAT-TTTNCCNN-5'
65108
||||||||||||||||| |||
66-
5-GGAAGAGTAATACACTA-AAAg-u-a-a-g-g Scaffold
67-
---gRNA spacer--- u-a
68-
u-a
69-
u-a
70-
u-a
71-
a-u
72-
g-u-g
73-
a a
74-
g-c-a
75-
c-g
76-
u-a
77-
a-u
78-
g a tetraloop
79-
a-a
109+
5'-GGAAGAGTAATACACTA AAA-g-u-a-a-g-g-3' Scaffold (lower case)
110+
---gRNA spacer------- u-a
111+
u-a
112+
u-a
113+
u-a
114+
a-u
115+
g-u-g
116+
a a
117+
g-c-a
118+
c-g
119+
u-a
120+
a-u
121+
g a
122+
a-a
80123
"""
81124

82-
scaffold = "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGG"
83-
pam = ".GG"
84-
size = 20
85-
fst5 = 17
86-
fst3 = -3
87-
ovhg = fst5 - (size + fst3)
88-
89-
def search(self, dna, linear=True):
90-
"""docstring."""
91-
dna = str(dna).upper()
92-
if linear:
93-
dna = dna
94-
else:
95-
dna = dna + dna[1 : self.size]
96-
results = []
97-
for mobj in self.compsite.finditer(dna):
98-
w, c = mobj.groups()
99-
if w:
100-
results.append(mobj.start("watson") + 1 + self.fst5)
101-
if c:
102-
results.append(mobj.start("crick") + len(self.pam) + 1 - self.fst3)
125+
scaffold: str = "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGG"
126+
pam: str = ".GG"
127+
size: int = 20
128+
fst5: int = 17
129+
fst3: int = -3
130+
ovhg: int = fst5 - (size + fst3)
131+
132+
def search(self, dna, linear: bool = True) -> List[int]:
133+
"""
134+
Search for Cas9 target sites in a DNA sequence.
135+
136+
Args:
137+
dna: string, Bio.Seq.Seq or pydna.dseq.Dseq
138+
linear: Whether the DNA is linear or circular.
139+
140+
Returns:
141+
A list of cut site positions.
142+
"""
143+
from pydna.dseqrecord import Dseqrecord
144+
from pydna.sequence_regex import dseqrecord_finditer
145+
146+
if not hasattr(dna, "_data"):
147+
raise TypeError
148+
results: List[int] = []
149+
query = Dseqrecord(dna, circular=(not linear))
150+
matches_fwd = dseqrecord_finditer(self.compsite, query)
151+
matches_rev = dseqrecord_finditer(self.compsite, query.reverse_complement())
152+
for mobj in matches_fwd:
153+
results.append((mobj.start() + self.fst5 + 1) % len(dna))
154+
for mobj in matches_rev:
155+
results.append((len(dna) - (mobj.start() + self.fst5) + 1) % len(dna))
103156
return results
104157

105-
def __str__(self):
106-
"""docstring."""
107-
return f">{type(self).__name__} protospacer scaffold\n{self.protospacer} {self.scaffold}"
108158

159+
def protospacer(guide_construct: DseqrecordType, cas: Type[_cas] = cas9) -> List[str]:
160+
"""
161+
Extract protospacer sequences from a guide construct. This can for example
162+
be a plasmid containing the guide construct. This function returns a
163+
list since a several protospacers can be present.
109164
110-
def protospacer(guide_construct, cas=cas9):
111-
"""docstring."""
112-
in_watson = [
113-
mobj.group("ps")
114-
for mobj in re.finditer(
115-
f"(?P<ps>.{{{cas.size}}})(?:{cas.scaffold})",
116-
str(guide_construct.seq).upper(),
117-
)
118-
]
119-
in_crick = [
120-
rc(mobj.group("ps"))
121-
for mobj in re.finditer(
122-
f"(?:{rc(cas.scaffold)})(?P<ps>.{{{cas.size}}})",
123-
str(guide_construct.seq).upper(),
165+
Args:
166+
guide_construct: Sequence construct containing protospacer and scaffold.
167+
cas: CRISPR nuclease class defining spacer size and scaffold.
168+
169+
Returns:
170+
A list of protospacer sequences found in Watson and Crick orientations.
171+
"""
172+
if guide_construct.circular:
173+
total_length = cas.size + len(cas.scaffold)
174+
guide_construct = guide_construct[:] + guide_construct[: total_length - 1]
175+
176+
result = []
177+
178+
for s in guide_construct.seq.watson.upper(), guide_construct.seq.crick.upper():
179+
result.extend(
180+
mobj.group("ps")
181+
for mobj in re.finditer(
182+
f"(?P<ps>.{{{cas.size}}})(?:{cas.scaffold})",
183+
s,
184+
)
124185
)
125-
]
126-
return in_watson + in_crick
186+
187+
return result

0 commit comments

Comments
 (0)