|
1 | 1 | #!/usr/bin/env python3 |
2 | 2 | # -*- coding: utf-8 -*- |
3 | 3 |
|
4 | | -# Copyright 2013-2023 by Björn Johansson. All rights reserved. |
5 | | -# This code is part of the Python-dna distribution and governed by its |
6 | | -# license. Please see the LICENSE.txt file that should have been included |
7 | | -# as part of this package. |
8 | | -"""Provides the Dseq class for handling double stranded DNA sequences. |
9 | | -
|
10 | | -Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class |
11 | | -is mostly useful as a part of the :class:`pydna.dseqrecord.Dseqrecord` class |
12 | | -which can hold more meta data. |
13 | | -
|
14 | | -The Dseq class support the notion of circular and linear DNA topology. |
15 | 4 | """ |
| 5 | +Utilities for CRISPR/Cas target searching and protospacer extraction. |
16 | 6 |
|
17 | | -from abc import ABC, abstractmethod |
| 7 | +""" |
18 | 8 | import re |
19 | | -from pydna.utils import rc |
| 9 | +from abc import ABC |
| 10 | +from abc import abstractmethod |
| 11 | +from typing import Type |
| 12 | +from typing import TYPE_CHECKING |
| 13 | +from typing import List |
| 14 | +from typing import TypeVar |
| 15 | + |
| 16 | +if TYPE_CHECKING: # pragma: no cover |
| 17 | + from pydna.dseqrecord import Dseqrecord |
| 18 | + |
| 19 | +DseqrecordType = TypeVar("DseqrecordType", bound="Dseqrecord") |
20 | 20 |
|
21 | 21 |
|
22 | 22 | class _cas(ABC): |
23 | | - scaffold = "ND" |
24 | | - pam = "ND" |
25 | | - size = 0 |
26 | | - fst5 = 0 |
27 | | - fst3 = 0 |
28 | | - |
29 | | - def __init__(self, protospacer): |
30 | | - self.protospacer = protospacer.upper() |
31 | | - self.compsite = re.compile( |
32 | | - f"(?=(?P<watson>{self.protospacer}{self.pam}))|(?=(?P<crick>{rc(self.pam)}{rc(self.protospacer)}))", |
33 | | - re.UNICODE, |
34 | | - ) |
| 23 | + """ |
| 24 | + Abstract base class for CRISPR-associated nucleases. |
| 25 | + pam, scaffold and cut location is set by a subclass |
| 26 | + such as Cas9 below. |
| 27 | +
|
| 28 | + The meaning of size, fst5 and fst3 are the same as for the restriciton |
| 29 | + enzymes in the Biopython restriction module (Bio.Restriction). |
| 30 | + """ |
| 31 | + |
| 32 | + scaffold: str = "ND" |
| 33 | + pam: str = "ND" |
| 34 | + size: int = 0 |
| 35 | + fst5: int = 0 |
| 36 | + fst3: int = 0 |
| 37 | + |
| 38 | + def __init__(self, protospacer: str) -> None: |
| 39 | + """ |
| 40 | + Initialize the nuclease with a protospacer sequence. |
| 41 | + The sequence is a string. Use the protospacer function |
| 42 | + to extract a sequence from a Dseqrecord. |
| 43 | +
|
| 44 | + Args: |
| 45 | + protospacer: Protospacer sequence used to build the search pattern. |
| 46 | + """ |
| 47 | + from pydna.sequence_regex import compute_regex_site |
| 48 | + |
| 49 | + self.protospacer: str = protospacer.upper() |
| 50 | + self.compsite = compute_regex_site(f"{self.protospacer}{self.pam}") |
35 | 51 |
|
36 | 52 | @abstractmethod |
37 | | - def search(self, dna, linear=True): |
38 | | - """To override in subclass.""" |
39 | | - pass |
| 53 | + def search(self, dna, linear: bool = True) -> List[int]: |
| 54 | + """Return a list of cutting sites of the enzyme in the sequence. |
| 55 | +
|
| 56 | + dna must be an instance of: |
| 57 | +
|
| 58 | + - pydna.dseq.Dseq |
| 59 | + - Bio.Seq.Seq |
| 60 | + - Bio.Seq.MutableSeq |
40 | 61 |
|
41 | | - def __repr__(self): |
| 62 | + pydna.dseqrecord.Dseqrecord or Bio.SeqRecord.SeqRecord will not work. |
| 63 | + This limitation is by design t omirror enzymes in the |
| 64 | + Biopython Bio.Restriction class |
| 65 | +
|
| 66 | + The linear argument is laso there for compatibility with the |
| 67 | + Biopython Bio.Restriction class. |
| 68 | +
|
| 69 | + An important caveat is that search ignores the circular property of |
| 70 | + pydna.dseq.Dseq. |
| 71 | +
|
| 72 | + If linear is False, the restriction sites that span over the boundaries |
| 73 | + will be included. |
| 74 | +
|
| 75 | + The positions are the first base of the 3' fragment, |
| 76 | + i.e. the first base after the position the enzyme will cut. |
| 77 | + """ |
| 78 | + raise NotImplementedError # pragma: no cover |
| 79 | + |
| 80 | + def __repr__(self) -> str: |
| 81 | + """ |
| 82 | + Return a compact representation of the Cas9+gRNA nuclease instance. |
| 83 | +
|
| 84 | + Returns: |
| 85 | + String representation with abbreviated protospacer. |
| 86 | + """ |
42 | 87 | return f"{type(self).__name__}({self.protospacer[:3]}..{self.protospacer[-3:]})" |
43 | 88 |
|
44 | | - @abstractmethod |
45 | | - def __str__(self): |
46 | | - """To override in subclass.""" |
47 | | - pass |
| 89 | + def __str__(self) -> str: |
| 90 | + """ |
| 91 | + Return the guide RNA protospacer and scaffold as FASTA-like string. |
| 92 | + """ |
| 93 | + return f">{type(self).__name__} protospacer scaffold\n{self.protospacer} {self.scaffold}" |
48 | 94 |
|
49 | 95 |
|
50 | 96 | class cas9(_cas): |
51 | 97 | """docstring. |
52 | 98 |
|
53 | 99 | .. code-block:: |
54 | 100 |
|
55 | | - |----size----------| |
56 | 101 |
|
57 | | - ---protospacer------ |
58 | | - -fst3 |
59 | | - fst5 |-| |
60 | | - |--------------| |
61 | | - PAM |
62 | | - 5-NNGGAAGAGTAATACACTA-AAANGGNN-3 |
63 | | - ||||||||||||||||||| |||||||| |
64 | | - 3-NNCCTTCTCATTATGTGAT-TTTNCCNN-5 |
| 102 | + fst5 --|fst3 |
| 103 | + |---------------- |
| 104 | + PAM |
| 105 | + 5'-NNGGAAGAGTAATACACTA-AAANGGNN-3' |
| 106 | + ||||||||||||||||||| |||||||| |
| 107 | + 3'-NNCCTTCTCATTATGTGAT-TTTNCCNN-5' |
65 | 108 | ||||||||||||||||| ||| |
66 | | - 5-GGAAGAGTAATACACTA-AAAg-u-a-a-g-g Scaffold |
67 | | - ---gRNA spacer--- u-a |
68 | | - u-a |
69 | | - u-a |
70 | | - u-a |
71 | | - a-u |
72 | | - g-u-g |
73 | | - a a |
74 | | - g-c-a |
75 | | - c-g |
76 | | - u-a |
77 | | - a-u |
78 | | - g a tetraloop |
79 | | - a-a |
| 109 | + 5'-GGAAGAGTAATACACTA AAA-g-u-a-a-g-g-3' Scaffold (lower case) |
| 110 | + ---gRNA spacer------- u-a |
| 111 | + u-a |
| 112 | + u-a |
| 113 | + u-a |
| 114 | + a-u |
| 115 | + g-u-g |
| 116 | + a a |
| 117 | + g-c-a |
| 118 | + c-g |
| 119 | + u-a |
| 120 | + a-u |
| 121 | + g a |
| 122 | + a-a |
80 | 123 | """ |
81 | 124 |
|
82 | | - scaffold = "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGG" |
83 | | - pam = ".GG" |
84 | | - size = 20 |
85 | | - fst5 = 17 |
86 | | - fst3 = -3 |
87 | | - ovhg = fst5 - (size + fst3) |
88 | | - |
89 | | - def search(self, dna, linear=True): |
90 | | - """docstring.""" |
91 | | - dna = str(dna).upper() |
92 | | - if linear: |
93 | | - dna = dna |
94 | | - else: |
95 | | - dna = dna + dna[1 : self.size] |
96 | | - results = [] |
97 | | - for mobj in self.compsite.finditer(dna): |
98 | | - w, c = mobj.groups() |
99 | | - if w: |
100 | | - results.append(mobj.start("watson") + 1 + self.fst5) |
101 | | - if c: |
102 | | - results.append(mobj.start("crick") + len(self.pam) + 1 - self.fst3) |
| 125 | + scaffold: str = "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGG" |
| 126 | + pam: str = ".GG" |
| 127 | + size: int = 20 |
| 128 | + fst5: int = 17 |
| 129 | + fst3: int = -3 |
| 130 | + ovhg: int = fst5 - (size + fst3) |
| 131 | + |
| 132 | + def search(self, dna, linear: bool = True) -> List[int]: |
| 133 | + """ |
| 134 | + Search for Cas9 target sites in a DNA sequence. |
| 135 | +
|
| 136 | + Args: |
| 137 | + dna: string, Bio.Seq.Seq or pydna.dseq.Dseq |
| 138 | + linear: Whether the DNA is linear or circular. |
| 139 | +
|
| 140 | + Returns: |
| 141 | + A list of cut site positions. |
| 142 | + """ |
| 143 | + from pydna.dseqrecord import Dseqrecord |
| 144 | + from pydna.sequence_regex import dseqrecord_finditer |
| 145 | + |
| 146 | + if not hasattr(dna, "_data"): |
| 147 | + raise TypeError |
| 148 | + results: List[int] = [] |
| 149 | + query = Dseqrecord(dna, circular=(not linear)) |
| 150 | + matches_fwd = dseqrecord_finditer(self.compsite, query) |
| 151 | + matches_rev = dseqrecord_finditer(self.compsite, query.reverse_complement()) |
| 152 | + for mobj in matches_fwd: |
| 153 | + results.append((mobj.start() + self.fst5 + 1) % len(dna)) |
| 154 | + for mobj in matches_rev: |
| 155 | + results.append((len(dna) - (mobj.start() + self.fst5) + 1) % len(dna)) |
103 | 156 | return results |
104 | 157 |
|
105 | | - def __str__(self): |
106 | | - """docstring.""" |
107 | | - return f">{type(self).__name__} protospacer scaffold\n{self.protospacer} {self.scaffold}" |
108 | 158 |
|
| 159 | +def protospacer(guide_construct: DseqrecordType, cas: Type[_cas] = cas9) -> List[str]: |
| 160 | + """ |
| 161 | + Extract protospacer sequences from a guide construct. This can for example |
| 162 | + be a plasmid containing the guide construct. This function returns a |
| 163 | + list since a several protospacers can be present. |
109 | 164 |
|
110 | | -def protospacer(guide_construct, cas=cas9): |
111 | | - """docstring.""" |
112 | | - in_watson = [ |
113 | | - mobj.group("ps") |
114 | | - for mobj in re.finditer( |
115 | | - f"(?P<ps>.{{{cas.size}}})(?:{cas.scaffold})", |
116 | | - str(guide_construct.seq).upper(), |
117 | | - ) |
118 | | - ] |
119 | | - in_crick = [ |
120 | | - rc(mobj.group("ps")) |
121 | | - for mobj in re.finditer( |
122 | | - f"(?:{rc(cas.scaffold)})(?P<ps>.{{{cas.size}}})", |
123 | | - str(guide_construct.seq).upper(), |
| 165 | + Args: |
| 166 | + guide_construct: Sequence construct containing protospacer and scaffold. |
| 167 | + cas: CRISPR nuclease class defining spacer size and scaffold. |
| 168 | +
|
| 169 | + Returns: |
| 170 | + A list of protospacer sequences found in Watson and Crick orientations. |
| 171 | + """ |
| 172 | + if guide_construct.circular: |
| 173 | + total_length = cas.size + len(cas.scaffold) |
| 174 | + guide_construct = guide_construct[:] + guide_construct[: total_length - 1] |
| 175 | + |
| 176 | + result = [] |
| 177 | + |
| 178 | + for s in guide_construct.seq.watson.upper(), guide_construct.seq.crick.upper(): |
| 179 | + result.extend( |
| 180 | + mobj.group("ps") |
| 181 | + for mobj in re.finditer( |
| 182 | + f"(?P<ps>.{{{cas.size}}})(?:{cas.scaffold})", |
| 183 | + s, |
| 184 | + ) |
124 | 185 | ) |
125 | | - ] |
126 | | - return in_watson + in_crick |
| 186 | + |
| 187 | + return result |
0 commit comments