diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..fce2bdccc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +* text=auto +data/* binary +src/* text=lf +R/* text=lf diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..4518497da --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.Rproj.user +.Rhistory +.RData +.RDataTmp +src/*.o +src/*.so +.DS_Store diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..a7fd13358 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,7 @@ +# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r + +language: R +sudo: false +cache: packages +warnings_are_errors: false +dist: trusty diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 000000000..d94e72009 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,88 @@ +Package: Seurat +Version: 2.0.0.900 +Date: 07/17/17 +Title: Seurat : R toolkit for single cell genomics +Description: Seurat : R toolkit for single cell genomics. 
+Author: Rahul Satija +Maintainer: Satija Lab <seuratpackage@gmail.com> +URL: http://www.satijalab.org/seurat +Depends: + R (>= 3.2.0), + ggplot2, + cowplot +SystemRequirements: Java (>= 1.6) +Imports: + methods, + ROCR, + stringr, + mixtools, + lars, + ica, + tsne, + Rtsne, + fpc, + ape, + VGAM, + pbapply, + igraph, + FNN, + caret, + e1071, + dplyr, + RColorBrewer, + MASS, + Matrix, + irlba, + reshape2, + gridExtra, + gplots, + gdata, + Rcpp, + RcppEigen, + RcppProgress, + tclust, + ranger, + compositions, + NMOF, + dtw, + SDMTools, + plotly, + diffusionMap, + Hmisc, + httr, + tidyr +LinkingTo: Rcpp, RcppEigen, RcppProgress +License: GPL-3 +Collate: + 'RcppExports.R' + 'seurat.R' + 'cluster_determination.R' + 'cluster_determination_internal.R' + 'cluster_validation.R' + 'deprecated_functions.R' + 'differential_expression.R' + 'differential_expression_internal.R' + 'dimensional_reduction.R' + 'dimensional_reduction_internal.R' + 'dimensional_reduction_utilities.R' + 'interaction.R' + 'jackstraw.R' + 'jackstraw_internal.R' + 'multi_modal.R' + 'plotting.R' + 'plotting_internal.R' + 'plotting_utilities.R' + 'preprocessing.R' + 'preprocessing_internal.R' + 'printing_utilities.R' + 'seuratFxns.R' + 'snn.R' + 'spatial.R' + 'spatial_internal.R' + 'tSNE_project.R' + 'utilities.R' + 'utilities_internal.R' + 'zfRenderSeurat.R' +RoxygenNote: 6.0.1 +Suggests: + testthat diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 000000000..9cecc1d46 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works.
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + {one line to give the program's name and a brief idea of what it does.} + Copyright (C) {year} {name of author} + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + {project} Copyright (C) {year} {fullname} + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 000000000..d5bd1ab22 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,324 @@ +# Generated by roxygen2: do not edit by hand + +export(AddImputedScore) +export(AddMetaData) +export(AddSamples) +export(AddSmoothedScore) +export(AlignSubspace) +export(AssessNodes) +export(AssessSplit) +export(AverageDetectionRate) +export(AverageExpression) +export(AveragePCA) +export(BuildClusterTree) +export(BuildRFClassifier) +export(BuildSNN) +export(CalcVarExpRatio) +export(CellCentroid) +export(CellPlot) +export(ClassifyCells) +export(ColorTSNESplit) +export(CreateSeuratObject) +export(CustomDistance) +export(CustomPalette) +export(DBClustDimension) +export(DBclust_dimension) +export(DMEmbed) +export(DMLoad) +export(DMPlot) +export(DarkTheme) +export(DiffExpTest) +export(DiffTTest) +export(DimElbowPlot) +export(DimHeatmap) +export(DimPlot) +export(DimTopCells) +export(DimTopGenes) +export(DoHeatmap) +export(DoKMeans) +export(DotPlot) +export(DotPlotGG) +export(ExactCellCentroid) +export(ExpMean) +export(ExpSD) +export(ExpVar) +export(ExtractField) +export(FastWhichCells) +export(FeatureHeatmap) +export(FeatureLocator) +export(FeaturePlot) +export(FetchData) +export(FilterCells) +export(FindAllMarkers) +export(FindAllMarkersNode) +export(FindClusters) +export(FindConservedMarkers) +export(FindMarkers) +export(FindMarkersNode) +export(FindVariableGenes) +export(FitGeneK) +export(FitGeneMix) +export(GenePlot) +export(GenesInCluster) +export(GetAssayData) +export(GetCellEmbeddings) +export(GetCentroids) +export(GetClusters) +export(GetDimReduction) +export(GetGeneLoadings) +export(HeatmapNode) +export(HoverLocator) +export(ICA) +export(ICAEmbed) +export(ICALoad) +export(ICAPlot) +export(ICHeatmap) +export(ICTopCells) +export(ICTopGenes) +export(InitialMapping) +export(JackRandom) +export(JackStraw) +export(JackStrawPlot) +export(KClustDimension) +export(KMeansHeatmap) +export(Kclust_dimension) +export(LogNormalize) +export(LogVMR) 
+export(MakeSparse) +export(MapCell) +export(MarkerTest) +export(MatrixRowShuffle) +export(MeanVarPlot) +export(MergeNode) +export(MergeSeurat) +export(MinMax) +export(NegBinomDETest) +export(NegBinomRegDETest) +export(NodeHeatmap) +export(NormalizeData) +export(NumberClusters) +export(OldDoHeatmap) +export(PCA) +export(PCAEmbed) +export(PCALoad) +export(PCAPlot) +export(PCASigGenes) +export(PCElbowPlot) +export(PCHeatmap) +export(PCTopCells) +export(PCTopGenes) +export(PlotClusterTree) +export(PoissonDETest) +export(PosteriorPlot) +export(PrintAlignSubspaceParams) +export(PrintCCAParams) +export(PrintCalcParams) +export(PrintCalcVarExpRatioParams) +export(PrintDMParams) +export(PrintDim) +export(PrintFindClustersParams) +export(PrintICA) +export(PrintICAParams) +export(PrintPCA) +export(PrintPCAParams) +export(PrintSNNParams) +export(PrintTSNEParams) +export(ProjectDim) +export(ProjectPCA) +export(Read10X) +export(RefinedMapping) +export(RemoveFromTable) +export(RenameIdent) +export(ReorderIdent) +export(RunCCA) +export(RunDiffusion) +export(RunICA) +export(RunPCA) +export(RunTSNE) +export(SampleUMI) +export(SaveClusters) +export(ScaleData) +export(ScaleDataR) +export(SetAllIdent) +export(SetAssayData) +export(SetClusters) +export(SetDimReduction) +export(SetIdent) +export(Shuffle) +export(SplitDotPlotGG) +export(StashIdent) +export(SubsetColumn) +export(SubsetData) +export(SubsetRow) +export(TSNEPlot) +export(TobitTest) +export(UpdateSeuratObject) +export(ValidateClusters) +export(ValidateSpecificClusters) +export(VariableGenePlot) +export(VizClassification) +export(VizDimReduction) +export(VizICA) +export(VizPCA) +export(VlnPlot) +export(WhichCells) +export(XCellCentroid) +export(YCellCentroid) +export(addImputedScore) +export(addMetaData) +export(addSmoothedScore) +export(add_samples) +export(average.expression) +export(average.pca) +export(batch.gene) +export(buildClusterTree) +export(bwCols) +export(calc.insitu) +export(calcNoiseModels) +export(calinskiPlot) 
+export(cell.cor.matrix) +export(cellPlot) +export(cluster.alpha) +export(diff.t.test) +export(diffExp.test) +export(dim.plot) +export(doHeatMap) +export(doKMeans) +export(dot.plot) +export(feature.heatmap) +export(feature.plot) +export(feature.plot.keynote) +export(fetch.data) +export(find.markers) +export(find.markers.node) +export(find_all_markers) +export(fit.gene.k) +export(fit.gene.mix) +export(gene.cor.matrix) +export(genePlot) +export(geneScorePlot) +export(genes.in.cluster) +export(get.centroids) +export(getNewScore) +export(icHeatmap) +export(icTopGenes) +export(ica) +export(ica.plot) +export(initial.mapping) +export(jackRandom) +export(jackStraw) +export(jackStraw.permutation.test) +export(jackStrawFull) +export(jackStrawMC) +export(jackStrawPlot) +export(kMeansHeatmap) +export(map.cell) +export(marker.test) +export(mean.var.plot) +export(minusc) +export(minusr) +export(pcHeatmap) +export(pcTopCells) +export(pcTopGenes) +export(pca) +export(pca.plot) +export(pca.sig.genes) +export(plotClusterTree) +export(plotNoiseModel) +export(posterior.plot) +export(print.pca) +export(project.pca) +export(project.samples) +export(pyCols) +export(refined.mapping) +export(regulatorScore) +export(removePC) +export(rename.ident) +export(reorder.ident) +export(run_diffusion) +export(run_tsne) +export(set.all.ident) +export(set.ident) +export(situ3d) +export(spatial.de) +export(subsetCells) +export(subsetData) +export(tobit.test) +export(tsne.plot) +export(viz.ica) +export(viz.pca) +export(vlnPlot) +export(which.cells) +export(writ.table) +export(zf.anchor.render) +export(zf.cells.render) +export(zf.insitu.dorsal) +export(zf.insitu.lateral) +export(zf.insitu.side) +export(zf.insitu.vec.lateral) +export(zf.insitu.ventral) +exportClasses(seurat) +exportMethods(show) +import(Matrix) +import(ROCR) +import(SDMTools) +import(VGAM) +import(diffusionMap) +import(fpc) +import(ggplot2) +import(gridExtra) +import(lars) +import(parallel) +import(pbapply) +import(stringr) 
+importFrom(FNN,get.knn) +importFrom(MASS,glm.nb) +importFrom(MASS,kde2d) +importFrom(MASS,negative.binomial) +importFrom(MASS,theta.ml) +importFrom(Matrix,colSums) +importFrom(Matrix,readMM) +importFrom(Matrix,rowSums) +importFrom(Matrix,sparseMatrix) +importFrom(RColorBrewer,brewer.pal.info) +importFrom(Rcpp,evalCpp) +importFrom(Rtsne,Rtsne) +importFrom(ape,as.phylo) +importFrom(ape,drop.tip) +importFrom(ape,nodelabels) +importFrom(ape,plot.phylo) +importFrom(caret,train) +importFrom(caret,trainControl) +importFrom(cowplot,plot_grid) +importFrom(dplyr,"%>%") +importFrom(dplyr,filter) +importFrom(dplyr,full_join) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,mutate_each) +importFrom(dplyr,select) +importFrom(dplyr,summarize) +importFrom(dplyr,summarize_each) +importFrom(dplyr,top_n) +importFrom(dplyr,ungroup) +importFrom(dtw,dtw) +importFrom(gdata,drop.levels) +importFrom(gplots,heatmap.2) +importFrom(ica,icafast) +importFrom(ica,icaimax) +importFrom(ica,icajade) +importFrom(igraph,E) +importFrom(igraph,graph.adjacency) +importFrom(igraph,graph.adjlist) +importFrom(igraph,plot.igraph) +importFrom(irlba,irlba) +importFrom(mixtools,normalmixEM) +importFrom(pbapply,pbapply) +importFrom(pbapply,pblapply) +importFrom(pbapply,pbsapply) +importFrom(ranger,ranger) +importFrom(reshape2,melt) +importFrom(tclust,tkmeans) +importFrom(tidyr,gather) +importFrom(tsne,tsne) +useDynLib(Seurat) diff --git a/R/.Rapp.history b/R/.Rapp.history new file mode 100644 index 000000000..e69de29bb diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 000000000..245b082d5 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,47 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +RunUMISampling <- function(data, sample_val, upsample = FALSE, display_progress = TRUE) { + .Call('_Seurat_RunUMISampling', PACKAGE = 'Seurat', data, sample_val, upsample, display_progress) +} + 
+RunUMISamplingPerCell <- function(data, sample_val, upsample = FALSE, display_progress = TRUE) { + .Call('_Seurat_RunUMISamplingPerCell', PACKAGE = 'Seurat', data, sample_val, upsample, display_progress) +} + +RowMergeMatrices <- function(mat1, mat2, mat1_rownames, mat2_rownames, all_rownames) { + .Call('_Seurat_RowMergeMatrices', PACKAGE = 'Seurat', mat1, mat2, mat1_rownames, mat2_rownames, all_rownames) +} + +LogNorm <- function(data, scale_factor, display_progress = TRUE) { + .Call('_Seurat_LogNorm', PACKAGE = 'Seurat', data, scale_factor, display_progress) +} + +FastMatMult <- function(m1, m2) { + .Call('_Seurat_FastMatMult', PACKAGE = 'Seurat', m1, m2) +} + +FastRowScale <- function(mat, scale = TRUE, center = TRUE, scale_max = 10, display_progress = TRUE) { + .Call('_Seurat_FastRowScale', PACKAGE = 'Seurat', mat, scale, center, scale_max, display_progress) +} + +Standardize <- function(mat, display_progress = TRUE) { + .Call('_Seurat_Standardize', PACKAGE = 'Seurat', mat, display_progress) +} + +FastSparseRowScale <- function(mat, scale = TRUE, center = TRUE, scale_max = 10, display_progress = TRUE) { + .Call('_Seurat_FastSparseRowScale', PACKAGE = 'Seurat', mat, scale, center, scale_max, display_progress) +} + +FastCov <- function(mat, center = TRUE) { + .Call('_Seurat_FastCov', PACKAGE = 'Seurat', mat, center) +} + +FastCovMats <- function(mat1, mat2, center = TRUE) { + .Call('_Seurat_FastCovMats', PACKAGE = 'Seurat', mat1, mat2, center) +} + +FastRBind <- function(mat1, mat2) { + .Call('_Seurat_FastRBind', PACKAGE = 'Seurat', mat1, mat2) +} + diff --git a/R/cluster_determination.R b/R/cluster_determination.R new file mode 100644 index 000000000..f3d5941bd --- /dev/null +++ b/R/cluster_determination.R @@ -0,0 +1,649 @@ +#' @include seurat.R +NULL +#' Cluster Determination +#' +#' Identify clusters of cells by a shared nearest neighbor (SNN) modularity +#' optimization based clustering algorithm. 
First calculate k-nearest neighbors
+#' and construct the SNN graph. Then optimize the modularity function to
+#' determine clusters. For a full description of the algorithms, see Waltman and
+#' van Eck (2013) \emph{The European Physical Journal B}.
+#'
+#' @param object Seurat object
+#' @param genes.use A vector of gene names to use in construction of SNN graph
+#' if building directly based on expression data rather than a dimensionally
+#' reduced representation (i.e. PCs).
+#' @param reduction.type Name of dimensional reduction technique to use in
+#' construction of SNN graph. (e.g. "pca", "ica")
+#' @param dims.use A vector of the dimensions to use in construction of the SNN
+#' graph (e.g. To use the first 10 PCs, pass 1:10)
+#' @param k.param Defines k for the k-nearest neighbor algorithm
+#' @param k.scale Granularity option for k.param
+#' @param plot.SNN Plot the SNN graph
+#' @param prune.SNN Sets the cutoff for acceptable Jaccard distances when
+#' computing the neighborhood overlap for the SNN construction. Any edges with
+#' values less than or equal to this will be set to 0 and removed from the SNN
+#' graph. Essentially sets the stringency of pruning (0 --- no pruning, 1 ---
+#' prune everything).
+#' @param print.output Whether or not to print output to the console
+#' @param distance.matrix Build SNN from distance matrix (experimental)
+#' @param save.SNN Saves the SNN matrix associated with the calculation in
+#' object@@snn
+#' @param reuse.SNN Force utilization of stored SNN. If none is stored, this will
+#' throw an error.
+#' @param force.recalc Force recalculation of SNN.
+#' @param modularity.fxn Modularity function (1 = standard; 2 = alternative).
+#' @param resolution Value of the resolution parameter, use a value above
+#' (below) 1.0 if you want to obtain a larger (smaller) number of communities. 
+#' @param algorithm Algorithm for modularity optimization (1 = original Louvain +#' algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM +#' algorithm). +#' @param n.start Number of random starts. +#' @param n.iter Maximal number of iterations per random start. +#' @param random.seed Seed of the random number generator. +#' @param temp.file.location Directory where intermediate files will be written. +#' Specify the ABSOLUTE path. +#' @importFrom FNN get.knn +#' @importFrom igraph plot.igraph graph.adjlist +#' @importFrom Matrix sparseMatrix +#' @return Returns a Seurat object and optionally the SNN matrix, +#' object@@ident has been updated with new cluster info +#' +#' @export +#' +FindClusters <- function( + object, + genes.use = NULL, + reduction.type = "pca", + dims.use = NULL, + k.param = 30, + k.scale = 25, + plot.SNN = FALSE, + prune.SNN = 1/15, + print.output = TRUE, + distance.matrix = NULL, + save.SNN = FALSE, + reuse.SNN = FALSE, + force.recalc = FALSE, + modularity.fxn = 1, + resolution = 0.8, + algorithm = 1, + n.start = 100, + n.iter = 10, + random.seed = 0, + temp.file.location = NULL +) { + snn.built <- FALSE + if (.hasSlot(object = object, name = "snn")) { + if (length(x = object@snn) > 1) { + snn.built <- TRUE + save.SNN <- TRUE + } + } + if (( + missing(x = genes.use) && missing(x = dims.use) && missing(x = k.param) && + missing(x = k.scale) && missing(x = prune.SNN) && snn.built + ) || reuse.SNN) { + save.SNN <- TRUE + if (reuse.SNN && !snn.built) { + stop("No SNN stored to reuse.") + } + if (reuse.SNN && ( + ! missing(x = genes.use) || ! missing(x = dims.use) || ! missing(x = k.param) + || ! missing(x = k.scale) || ! missing(x = prune.SNN) + )) { + warning("SNN was not be rebuilt with new parameters. Continued with stored + SNN. 
To suppress this warning, remove all SNN building parameters.") + } + } else { + # if any SNN building parameters are provided or it hasn't been built, build + # a new SNN + object <- BuildSNN( + object = object, + genes.use = genes.use, + reduction.type = reduction.type, + dims.use = dims.use, + k.param = k.param, + k.scale = k.scale, + plot.SNN = plot.SNN, + prune.SNN = prune.SNN, + print.output = print.output, + distance.matrix = distance.matrix, + force.recalc = force.recalc + ) + } + for (r in resolution) { + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("FindClusters"))] + parameters.to.store$resolution <- r + if (CalcInfoExists(object, paste0("FindClusters.res.", r)) & force.recalc != TRUE){ + parameters.to.store$object <- NULL + old.parameters <- GetAllCalcParam(object = object, + calculation = paste0("FindClusters.res.", r)) + old.parameters$time <- NULL + if(all(old.parameters %in% parameters.to.store)){ + warning(paste0("Clustering parameters for resolution ", r, " exactly match those of already computed. \n To force recalculation, set force.recalc to TRUE.")) + next + } + } + object <- SetCalcParams(object = object, + calculation = paste0("FindClusters.res.", r), + ... = parameters.to.store) + object <- RunModularityClustering( + object = object, + SNN = object@snn, + modularity = modularity.fxn, + resolution = r, + algorithm = algorithm, + n.start = n.start, + n.iter = n.iter, + random.seed = random.seed, + print.output = print.output, + temp.file.location = temp.file.location + + ) + object <- GroupSingletons(object = object, SNN = object@snn) + name <- paste0("res.", r) + object <- StashIdent(object = object, save.name = name) + } + if (!save.SNN) { + object@snn <- sparseMatrix(1, 1, x = 1) + object <- RemoveCalcParams(object = object, + calculation = "BuildSNN") + } + return(object) +} + +#' Get Cluster Assignments +#' +#' Retrieve cluster IDs as a dataframe. 
First column will be the cell name,
+#' second column will be the current cluster identity (pulled from object@@ident).
+#'
+#' @param object Seurat object with cluster assignments
+#' @return Returns a dataframe with cell names and cluster assignments
+#' @export
+#'
+GetClusters <- function(object) {
+  clusters <- data.frame(cell.name = names(object@ident), cluster = object@ident)
+  rownames(clusters) <- NULL
+  # factor level names, not the factor itself, are the stable identifiers
+  clusters$cell.name <- as.character(clusters$cell.name)
+  return(clusters)
+}
+
+#' Set Cluster Assignments
+#'
+#' Easily set the cluster assignments using the output of GetClusters() ---
+#' a dataframe with cell names as the first column and cluster assignments as
+#' the second.
+#'
+#' @param object Seurat object
+#' @param clusters A dataframe containing the cell names and cluster assignments
+#' to set for the object.
+#' @return Returns a Seurat object with the identities set to the cluster
+#' assignments that were passed.
+#' @export
+#'
+SetClusters <- function(object, clusters = NULL) {
+  if(!(all(c("cell.name", "cluster") %in% colnames(clusters)))){
+    stop("The clusters parameter must be the output from GetClusters (i.e.
+          Columns must be cell.name and cluster)")
+  }
+  cells.use <- clusters$cell.name
+  ident.use <- clusters$cluster
+  object <- SetIdent(
+    object = object,
+    cells.use = cells.use,
+    ident.use = ident.use
+  )
+  return(object)
+  }
+
+#' Save cluster assignments to a TSV file
+#'
+#' @param object Seurat object with cluster assignments
+#' @param file Path to file to write cluster assignments to
+#' @return No return value. Writes cluster assignments to specified file. 
+#' @export +#' +SaveClusters <- function(object, file) { + my.clusters <- GetClusters(object = object) + write.table(my.clusters, file = file, sep="\t", quote = FALSE, row.names = F) +} + +#' Convert the cluster labels to a numeric representation +#' +#' @param object Seurat object +#' @return Returns a Seurat object with the identities relabeled numerically +#' starting from 1. +#' +#' @export +NumberClusters <- function(object) { + clusters <- unique(x = object@ident) + if (typeof(x = clusters) == "integer") { + n <- as.numeric(x = max(clusters)) + 1 + for (i in clusters) { + object <- SetIdent( + object = object, + cells.use = WhichCells(object = object, ident = i), + ident.use = n + ) + n <- n + 1 + } + clusters <- unique(x = object@ident) + } + n <- 1 + for (i in clusters) { + object <- SetIdent( + object, + cells.use = WhichCells(object = object, ident = i), + ident.use = n + ) + n <- n + 1 + } + return(object) +} + +#' Classify New Data +#' +#' Classify new data based on the cluster information of the provided object. +#' Random Forests are used as the basis of the classification. +#' +#' @param object Seurat object on which to train the classifier +#' @param classifier Random Forest classifier from BuildRFClassifier. If not provided, +#' it will be built from the training data provided. +#' @param training.genes Vector of genes to build the classifier on +#' @param training.classes Vector of classes to build the classifier on +#' @param new.data New data to classify +#' @param ... additional parameters passed to ranger +#' +#' @return Vector of cluster ids +#' +#' @import Matrix +#' @importFrom ranger ranger +#' +#' @export +#' +ClassifyCells <- function( + object, + classifier, + training.genes = NULL, + training.classes = NULL, + new.data = NULL, + ... +) { + # build the classifier + if (missing(classifier)){ + classifier <- BuildRFClassifier( + object = object, + training.genes = training.genes, + training.classes = training.classes, + ... 
+ ) + } + # run the classifier on the new data + features <- classifier$forest$independent.variable.names + genes.to.add <- setdiff(x = features, y = rownames(x = new.data)) + data.to.add <- matrix( + data = 0, + nrow = length(x = genes.to.add), + ncol = ncol(x = new.data) + ) + rownames(x = data.to.add) <- genes.to.add + new.data <- rbind(new.data, data.to.add) + new.data <- new.data[features, ] + new.data <- as.matrix(x = t(x = new.data)) + print("Running Classifier ...") + prediction <- predict(classifier, new.data) + new.classes <- prediction$predictions + return(new.classes) +} + +#' Build Random Forest Classifier +#' +#' Train the random forest classifier +#' +#' +#' @param object Seurat object on which to train the classifier +#' @param training.genes Vector of genes to build the classifier on +#' @param training.classes Vector of classes to build the classifier on +#' @param verbose Additional progress print statements +#' @param ... additional parameters passed to ranger +#' +#' @return Returns the random forest classifier +#' +#' @import Matrix +#' @importFrom ranger ranger +#' +#' @export +#' +BuildRFClassifier <- function( + object, + training.genes = NULL, + training.classes = NULL, + verbose = TRUE, + ... +) { + training.classes <- as.vector(x = training.classes) + training.genes <- SetIfNull( + x = training.genes, + default = rownames(x = object@data) + ) + training.data <- as.data.frame( + x = as.matrix( + x = t( + x = object@data[training.genes, ] + ) + ) + ) + training.data$class <- factor(x = training.classes) + if (verbose) { + print("Training Classifier ...") + } + classifier <- ranger( + data = training.data, + dependent.variable.name = "class", + classification = TRUE, + write.forest = TRUE, + ... 
+  )
+  return(classifier)
+}
+
+#' K-Means Clustering
+#'
+#' Perform k-means clustering on both genes and single cells
+#'
+#' K-means and heatmap are calculated on object@@scale.data
+#'
+#' @param object Seurat object
+#' @param genes.use Genes to use for clustering
+#' @param k.genes K value to use for clustering genes
+#' @param k.cells K value to use for clustering cells (default is 0, cells
+#' are not clustered)
+#' @param k.seed Random seed
+#' @param do.plot Draw heatmap of clustered genes/cells (default is FALSE).
+#' @param data.cut Clip all z-scores to have an absolute value below this.
+#' Reduces the effect of huge outliers in the data.
+#' @param k.cols Color palette for heatmap
+#' @param set.ident If clustering cells (so k.cells>0), set the cell identity
+#' class to its K-means cluster (default is TRUE)
+#' @param do.constrained FALSE by default. If TRUE, use the constrained K-means function implemented in the tclust package.
+#' @param assay.type Type of data to normalize for (default is RNA), but can be changed for multimodal analyses.
+#' @param \dots Additional parameters passed to kmeans (or tkmeans)
+#'
+#' @importFrom tclust tkmeans
+#'
+#' @return Seurat object where the k-means results for genes is stored in
+#'
+#' object@@kmeans.obj[[1]], and the k-means results for cells is stored in
+#' object@@kmeans.col[[1]]. The cluster for each cell is stored in object@@meta.data[,"kmeans.ident"]
+#' and also object@@ident (if set.ident=TRUE)
+#'
+#' @export
+DoKMeans <- function(
+  object,
+  genes.use = NULL,
+  k.genes = NULL,
+  k.cells = 0,
+  k.seed = 1,
+  do.plot = FALSE,
+  data.cut = 2.5,
+  k.cols = pyCols,
+  set.ident = TRUE,
+  do.constrained = FALSE,
+  assay.type="RNA",
+  ...
+) {
+  # Clip scaled expression values to +/- data.cut to limit outlier influence
+  data.use.orig <- GetAssayData(object = object, assay.type = assay.type, slot = "scale.data")
+  data.use <- MinMax(data = data.use.orig, min = data.cut * (-1), max = data.cut)
+  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
+  genes.use <- genes.use[genes.use %in% rownames(x = data.use)]
+  cells.use <- object@cell.names
+  kmeans.data <- data.use[genes.use, cells.use]
+  # Cluster the genes (rows)
+  if (do.constrained) {
+    set.seed(seed = k.seed)
+    # Bug fix: was tkmeans(objectkmeans.data, ...), which referenced an
+    # undefined object; the data matrix is kmeans.data
+    kmeans.obj <- tkmeans(x = kmeans.data, k = k.genes, ...)
+  } else {
+    set.seed(seed = k.seed)
+    kmeans.obj <- kmeans(x = kmeans.data, centers = k.genes, ...)
+  }
+  names(x = kmeans.obj$cluster) <- genes.use
+  # Optionally k-means cluster the cells (columns) as well
+  if (k.cells > 0) {
+    kmeans.col <- kmeans(x = t(x = kmeans.data), centers = k.cells)
+    names(x = kmeans.col$cluster) <- cells.use
+    object.kmeans <- new(
+      Class = "kmeans.info",
+      gene.kmeans.obj = kmeans.obj,
+      cell.kmeans.obj = kmeans.col
+    )
+  } else {
+    # Bug fix: kmeans.col was previously referenced unconditionally even
+    # though it is only computed when k.cells > 0; with the default
+    # k.cells = 0 the original code errored on an undefined variable
+    object.kmeans <- new(
+      Class = "kmeans.info",
+      gene.kmeans.obj = kmeans.obj
+    )
+  }
+  object@kmeans <- object.kmeans
+  if (k.cells > 0) {
+    kmeans.code <- paste("kmeans", k.cells, "ident", sep = ".")
+    object@meta.data[names(x = kmeans.col$cluster), kmeans.code] <- kmeans.col$cluster
+  }
+  if (set.ident && (k.cells > 0)) {
+    object <- SetIdent(
+      object = object,
+      cells.use = names(x = kmeans.col$cluster),
+      ident.use = kmeans.col$cluster
+    )
+  }
+  if (do.plot) {
+    KMeansHeatmap(object = object)
+  }
+  return(object)
+}
+
+#' Phylogenetic Analysis of Identity Classes
+#'
+#' Constructs a phylogenetic tree relating the 'average' cell from each
+#' identity class. Tree is estimated based on a distance matrix constructed in
+#' either gene expression space or PCA space.
+#'
+#' Note that the tree is calculated for an 'average' cell, so gene expression
+#' or PC scores are averaged across all cells in an identity class before the
+#' tree is constructed.
+#'
+#' @param object Seurat object
+#' @param genes.use Genes to use for the analysis. Default is the set of
+#' variable genes (object@@var.genes). 
Assumes pcs.use=NULL (tree calculated in +#' gene expression space) +#' @param pcs.use If set, tree is calculated in PCA space, using the +#' eigenvalue-WeightedEucleideanDist distance across these PC scores. +#' @param SNN.use If SNN is passed, build tree based on SNN graph connectivity between clusters +#' @param do.plot Plot the resulting phylogenetic tree +#' @param do.reorder Re-order identity classes (factor ordering), according to +#' position on the tree. This groups similar classes together which can be +#' helpful, for example, when drawing violin plots. +#' @param reorder.numeric Re-order identity classes according to position on +#' the tree, assigning a numeric value ('1' is the leftmost node) +#' +#' @return A Seurat object where the cluster tree is stored in +#' object@@cluster.tree[[1]] +#' +#' @importFrom ape as.phylo +#' +#' @export +#' +BuildClusterTree <- function( + object, + genes.use = NULL, + pcs.use = NULL, + SNN.use = NULL, + do.plot = TRUE, + do.reorder = FALSE, + reorder.numeric = FALSE +) { + genes.use <- SetIfNull(x = genes.use, default = object@var.genes) + ident.names <- as.character(x = unique(x = object@ident)) + if (! is.null(x = genes.use)) { + genes.use <- intersect(x = genes.use, y = rownames(x = object@data)) + data.avg <- AverageExpression(object = object, genes.use = genes.use) + data.dist <- dist(t(x = data.avg[genes.use, ])) + } + if (! is.null(x = pcs.use)) { + data.pca <- AveragePCA(object = object) + data.use <- data.pca[pcs.use,] + if (is.null(x = object@pca.obj[[1]]$sdev)) { + data.eigenval <- (object@pca.obj[[1]]$d) ^ 2 + } else { + data.eigenval <- (object@pca.obj[[1]]$sdev) ^ 2 + } + data.weights <- (data.eigenval / sum(data.eigenval))[pcs.use] + data.weights <- data.weights / sum(data.weights) + data.dist <- CustomDistance( + my.mat = data.pca[pcs.use, ], + my.function = WeightedEuclideanDist, + w = data.weights + ) + } + if (! 
is.null(x = SNN.use)) { + num.clusters <- length(x = ident.names) + data.dist = matrix(data = 0, nrow = num.clusters, ncol = num.clusters) + for (i in 1:(num.clusters - 1)) { + for (j in (i + 1):num.clusters) { + subSNN <- SNN.use[ + match( + x = WhichCells(object = object, ident = i), + table = colnames(x = SNN.use) + ), # Row + match( + x = WhichCells(object = object, ident = j), + table = rownames(x = SNN.use) + ) # Column + ] + d <- mean(subSNN) + if (is.na(x = d)) { + data.dist[i, j] <- 0 + } else { + data.dist[i, j] = d + } + } + } + diag(x = data.dist) <- 1 + data.dist <- dist(data.dist) + } + data.tree <- as.phylo(x = hclust(d = data.dist)) + object@cluster.tree[[1]] <- data.tree + if (do.reorder) { + old.ident.order <- sort(x = unique(x = object@ident)) + data.tree <- object@cluster.tree[[1]] + all.desc <- GetDescendants(tree = data.tree, node = (data.tree$Nnode + 2)) + all.desc <- old.ident.order[all.desc[all.desc <= (data.tree$Nnode + 1)]] + object@ident <- factor(x = object@ident, levels = all.desc, ordered = TRUE) + if (reorder.numeric) { + object <- SetIdent( + object = object, + cells.use = object@cell.names, + ident.use = as.integer(x = object@ident) + ) + object@meta.data[object@cell.names, "tree.ident"] <- as.integer(x = object@ident) + } + object <- BuildClusterTree( + object = object, + genes.use = genes.use, + pcs.use = pcs.use, + do.plot = FALSE, + do.reorder = FALSE + ) + } + if (do.plot) { + PlotClusterTree(object) + } + return(object) +} + + + +#' Perform spectral density clustering on single cells +#' +#' Find point clounds single cells in a two-dimensional space using density clustering (DBSCAN). +#' +#' @param object Seurat object +#' @param dim.1 First dimension to use +#' @param dim.2 second dimension to use +#' @param reduction.use Which dimensional reduction to use (either 'pca' or 'ica') +#' @param G.use Parameter for the density clustering. Lower value to get more fine-scale clustering +#' @param set.ident TRUE by default. 
Set identity class to the results of the density clustering.
#' Unassigned cells (cells that cannot be assigned a cluster) are placed in cluster 1, if there are any.
#' @param seed.use Random seed for the dbscan function
#' @param ... Additional arguments to be passed to the dbscan function
#'
#' @export
#'
DBClustDimension <- function(
  object,
  dim.1 = 1,
  dim.2 = 2,
  reduction.use = "tsne",
  G.use = NULL,
  set.ident = TRUE,
  seed.use = 1,
  ...
) {
  # Key (prefix) used to name the columns of the requested reduction.
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, c(dim.1, dim.2))
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  x1 <- paste0(dim.code, dim.1)
  x2 <- paste0(dim.code, dim.2)
  data.plot$x <- data.plot[, x1]
  data.plot$y <- data.plot[, x2]
  set.seed(seed = seed.use)
  # (Previously assigned to an unused intermediate `ds`; dropped.)
  data.mclust <- dbscan(data = data.plot[, c("x", "y")], eps = G.use, ...)
  # Shift cluster ids by one so dbscan's "unassigned" cluster 0 becomes 1.
  to.set <- as.numeric(x = data.mclust$cluster + 1)
  data.names <- names(x = object@ident)
  object@meta.data[data.names, "DBclust.ident"] <- to.set
  if (set.ident) {
    object@ident <- factor(x = to.set)
    names(x = object@ident) <- data.names
  }
  return(object)
}




#' Perform spectral k-means clustering on single cells
#'
#' Find point clouds of single cells in a low-dimensional space using k-means clustering.
#' Can be useful for smaller datasets, where graph-based clustering can perform poorly
#'
#' @param object Seurat object
#' @param dims.use Dimensions of the reduction to cluster on
#' @param cells.use Currently unused; kept for backward compatibility
#' @param pt.size Currently unused; kept for backward compatibility
#' @param reduction.use Which dimensional reduction to use
#' @param k.use Number of clusters (passed to kmeans as centers)
#' @param set.ident Set identity class to the clustering result
#' @param seed.use Random seed for kmeans
#' @param ... Currently unused; kept for backward compatibility
#'
#' @export
#'
KClustDimension <- function(
  object,
  dims.use = c(1, 2),
  cells.use = NULL,
  pt.size = 4,
  reduction.use = "tsne",
  k.use = 5,
  set.ident = TRUE,
  seed.use = 1,
  ...
) {
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, dims.use)
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  set.seed(seed = seed.use)
  # (Previously assigned to an unused intermediate `ds`; dropped.)
  data.mclust <- kmeans(x = data.plot, centers = k.use)
  to.set <- as.numeric(x = data.mclust$cluster)
  data.names <- names(x = object@ident)
  object@meta.data[data.names, "kdimension.ident"] <- to.set
  if (set.ident) {
    object@ident <- factor(x = to.set)
    names(x = object@ident) <- data.names
  }
  return(object)
}
diff --git a/R/cluster_determination_internal.R b/R/cluster_determination_internal.R
new file mode 100644
index 000000000..9bc1c8386
--- /dev/null
+++ b/R/cluster_determination_internal.R
@@ -0,0 +1,177 @@
# Runs the modularity optimizer java program (ModularityOptimizer.jar)
#
#
# @param object               Seurat object
# @param SNN                  SNN matrix to use as input for the clustering
#                             algorithms
# @param modularity           Modularity function to use in clustering (1 =
#                             standard; 2 = alternative).
# @param resolution           Value of the resolution parameter, use a value
#                             above (below) 1.0 if you want to obtain a larger
#                             (smaller) number of communities.
# @param algorithm            Algorithm for modularity optimization (1 =
#                             original Louvain algorithm; 2 = Louvain algorithm
#                             with multilevel refinement; 3 = SLM algorithm)
# @param n.start              Number of random starts.
# @param n.iter               Maximal number of iterations per random start.
# @param random.seed          Seed of the random number generator
# @param print.output         Whether or not to print output to the console
# @param temp.file.location   Directory where intermediate files will be written.
# @return                     Seurat object with identities set to the results
#                             of the clustering procedure.
RunModularityClustering <- function(
  object,
  SNN = matrix(),
  modularity = 1,
  resolution = 0.8,
  algorithm = 1,
  n.start = 100,
  n.iter = 10,
  random.seed = 0,
  print.output = TRUE,
  temp.file.location = NULL
) {
  seurat.dir <- system.file(package = "Seurat")
  ModularityJarFile <- paste0(seurat.dir, "/java/ModularityOptimizer.jar")
  # Parent directory of the installed package; used as the default scratch
  # location for the edge/output files exchanged with the jar.
  seurat.dir.base <- strsplit(x = seurat.dir, split = "/")[[1]]
  seurat.dir <- paste0(
    seurat.dir.base[seq_len(length(x = seurat.dir.base) - 1)],
    collapse = "/"
  )
  seurat.dir <- paste0(seurat.dir, "/")
  diag(x = SNN) <- 0
  # Convert the SNN graph to a 0-based (i, j, weight) edge list.
  if (is.object(x = SNN)) {
    SNN <- as(object = SNN, Class = "dgTMatrix")
    edge <- cbind(i = SNN@j, j = SNN@i, x = SNN@x)
  } else {
    swap <- which(x = SNN != 0, arr.ind = TRUE) - 1
    temp <- swap[, 1]
    swap[, 1] <- swap[, 2]
    swap[, 2] <- temp
    edge <- cbind(swap, SNN[which(x = SNN != 0, arr.ind = TRUE)])
  }
  rownames(x = edge) <- NULL
  colnames(x = edge) <- NULL
  edge <- edge[! duplicated(x = edge[, 1:2]), ]
  temp.file.location <- SetIfNull(x = temp.file.location, default = seurat.dir)
  unique_ID <- sample(x = 10000:99999, size = 1)
  edge_file <- paste0(temp.file.location, "edge_", unique_ID, ".txt")
  output_file <- paste0(temp.file.location, "output_", unique_ID, ".txt")
  # Re-draw the ID until neither scratch file exists.
  # BUG FIX: the retry branch previously built "output<ID>.txt" (missing
  # underscore), so the file read back after a retry never matched the one
  # the jar wrote; it also never checked the output file for collisions.
  while (file.exists(edge_file) || file.exists(output_file)) {
    unique_ID <- sample(x = 10000:99999, size = 1)
    edge_file <- paste0(temp.file.location, "edge_", unique_ID, ".txt")
    output_file <- paste0(temp.file.location, "output_", unique_ID, ".txt")
  }
  # The jar expects a 0/1 flag rather than an R logical.
  if (print.output) {
    print.output <- 1
  } else {
    print.output <- 0
  }
  write.table(
    x = edge,
    file = edge_file,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
  )
  if (modularity == 2 && resolution > 1) {
    stop("error: resolution<1 for alternative modularity")
  }
  command <- paste(
    "java -jar",
    shQuote(string = ModularityJarFile),
    shQuote(string = edge_file),
    shQuote(string = output_file),
    modularity,
    resolution,
    algorithm,
    n.start,
    n.iter,
    random.seed,
    print.output
  )
  system(command, wait = TRUE)
  ident.use <- read.table(file = output_file, header = FALSE, sep = "\t")[, 1]
  object <- SetIdent(
    object = object,
    cells.use = object@cell.names,
    ident.use = ident.use
  )
  file.remove(edge_file)
  file.remove(output_file)
  return(object)
}

# Group single cells that make up their own cluster in with the cluster they are
# most connected to.
#
# @param object  Seurat object
# @param SNN     SNN graph used in clustering
# @return        Returns Seurat object with all singletons merged with most
#                connected cluster

GroupSingletons <- function(object, SNN) {
  # Identify clusters containing exactly one cell.
  singletons <- c()
  for (cluster in unique(x = object@ident)) {
    if (length(x = WhichCells(object = object, ident = cluster)) == 1) {
      singletons <- append(x = singletons, values = cluster)
    }
  }
  # Calculate connectivity of singletons to other clusters, and add each
  # singleton to the cluster it is most connected to.
  cluster_names <- unique(x = object@ident)
  cluster_names <- setdiff(x = cluster_names, y = singletons)
  connectivity <- vector(mode = "numeric", length = length(x = cluster_names))
  names(x = connectivity) <- cluster_names
  for (i in singletons) {
    for (j in cluster_names) {
      subSNN <- SNN[
        WhichCells(object = object, ident = i), # Row
        match(
          x = WhichCells(object = object, ident = j),
          table = colnames(x = SNN)
        )
      ]
      if (is.object(x = subSNN)) {
        connectivity[j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
      } else {
        connectivity[j] <- mean(x = subSNN)
      }
    }
    m <- max(connectivity, na.rm = TRUE)
    mi <- which(x = connectivity == m, arr.ind = TRUE)
    # Break ties between equally connected clusters at random.
    closest_cluster <- sample(x = names(x = connectivity[mi]), 1)
    object <- SetIdent(
      object = object,
      cells.use = WhichCells(object = object, ident = i),
      ident.use = closest_cluster
    )
  }
  if (length(x = singletons) > 0) {
    print(paste(
      length(x = singletons),
      "singletons identified.",
      length(x = unique(object@ident)),
      "final clusters."
    ))
  }
  return(object)
}


# Set up kmeans class
# This is an infrequently used slot, but some people still find it very useful to do kmeans clustering
# and in particular, to do so at the gene level
# potential to be updated in the future

kmeans.info <- setClass(
  Class = "kmeans.info",
  slots = list(
    gene.kmeans.obj = "ANY",
    cell.kmeans.obj = "ANY"
  )
)
diff --git a/R/cluster_validation.R b/R/cluster_validation.R
new file mode 100644
index 000000000..0d6b4f5b7
--- /dev/null
+++ b/R/cluster_validation.R
@@ -0,0 +1,319 @@
#' @include seurat.R
NULL
#' Cluster Validation
#'
#' Methods for validating the legitimacy of clusters using classification. SVMs
#' are used as the basis for the classification. Merging is done based on the
#' connectivity from an SNN graph.
#'
#' @param object Seurat object
#' @param pc.use Which PCs to use to define genes in model construction
#' @param top.genes Use the top X genes for each PC in model construction
#' @param min.connectivity Threshold of connectedness for comparison of two
#' clusters
#' @param acc.cutoff Accuracy cutoff for classifier
#' @param verbose Controls whether to display progress and merging results
#' @importFrom caret trainControl train
#' @return Returns a Seurat object, object@@ident has been updated with new
#' cluster info
#' @export
ValidateClusters <- function(
  object,
  pc.use = NULL,
  top.genes = 30,
  min.connectivity = 0.01,
  acc.cutoff = 0.9,
  verbose = TRUE
) {
  # probably should refactor to make cleaner
  if (length(x = object@snn) > 1) {
    SNN.use <- object@snn
  } else {
    stop("SNN matrix required. Please run BuildSNN() to save the SNN matrix in
  the object slot")
  }
  if (is.null(x = pc.use)) {
    stop("pc.use not set. Please choose PCs.")
  }
  num.clusters.orig <- length(x = unique(x = object@ident))
  still_merging <- TRUE
  if (verbose) {
    connectivity <- CalcConnectivity(object = object)
    end <- length(x = connectivity[connectivity > min.connectivity])
    progress <- end
    status <- 0
  }
  # Repeatedly take the most-connected pair of clusters; merge it if an SVM
  # cannot tell the two clusters apart, otherwise zero out that pair and try
  # the next one. Stop once no pair exceeds min.connectivity.
  while (still_merging) {
    connectivity <- CalcConnectivity(object = object)
    merge.done <- FALSE
    while (! merge.done) {
      m <- max(connectivity, na.rm = TRUE)
      mi <- which(x = connectivity == m, arr.ind = TRUE)
      c1 <- rownames(x = connectivity)[mi[, 1]]
      c2 <- rownames(x = connectivity)[mi[, 2]]
      if (m > min.connectivity) {
        acc <- RunClassifier(
          object = object,
          group1 = c1,
          group2 = c2,
          pcs = pc.use,
          num.genes = top.genes
        )
        # if classifier can't classify them well enough, merge clusters
        if (acc < acc.cutoff) {
          object <- SetIdent(
            object = object,
            cells.use = WhichCells(object = object, ident = c1),
            ident.use = c2
          )
          if (verbose) {
            progress <- length(x = connectivity[connectivity > min.connectivity])
            print(paste0(
              sprintf("%3.0f", (1 - progress / end) * 100),
              "% complete --- merge clusters ",
              c1,
              " and ",
              c2,
              ", classification accuracy of ",
              sprintf("%1.4f", acc)
            ))
          }
          merge.done <- TRUE
        } else {
          if (verbose && status == 5) {
            print(paste0(
              sprintf("%3.0f", (1 - progress / end) * 100),
              "% complete --- Last 5 cluster comparisons failed to merge, ",
              "still checking possible merges ..."
            ))
            status <- 0
          }
          status <- status + 1
          connectivity[c1, c2] <- 0
          connectivity[c2, c1] <- 0
        }
      } else {
        still_merging <- FALSE
        break
      }
    }
  }
  if (verbose) {
    print(paste0(
      "100% complete --- started with ",
      num.clusters.orig,
      " clusters, ",
      length(x = unique(x = object@ident)),
      " clusters remaining"
    ))
  }
  return(object)
}

#' Specific Cluster Validation
#'
#' Methods for validating the legitimacy of two specific clusters using
#' classification.
SVMs are used as the basis for the classification.
#' Merging is done based on the connectivity from an SNN graph.
#'
#' @param object Seurat object
#' @param cluster1 First cluster to check classification
#' @param cluster2 Second cluster to check with classification
#' @param pc.use Which PCs to use for model construction
#' @param top.genes Use the top X genes for model construction
#' @param acc.cutoff Accuracy cutoff for classifier
#' @importFrom caret trainControl train
#' @return Returns a Seurat object, object@@ident has been updated with
#' new cluster info
#' @export
ValidateSpecificClusters <- function(
  object,
  cluster1 = NULL,
  cluster2 = 1,
  pc.use = 2,
  top.genes = 30,
  acc.cutoff = 0.9
) {
  acc <- RunClassifier(
    object = object,
    group1 = cluster1,
    group2 = cluster2,
    pcs = pc.use,
    num.genes = top.genes
  )
  print(paste0(
    "Comparing cluster ",
    cluster1,
    " and ",
    cluster2,
    ": Acc = ",
    acc
  ))
  # Merge the clusters when the classifier cannot reliably separate them.
  if (acc < acc.cutoff) {
    object <- SetIdent(
      object = object,
      cells.use = WhichCells(object = object, ident = cluster1),
      ident.use = cluster2
    )
    print(paste("merge cluster", cluster1, "and", cluster2))
  }
  return(object)
}

# Train an SVM classifier and return the accuracy after 5 fold CV
#
# @param object     Seurat object
# @param group1     One identity to train classifier on
# @param group2     Second identity to train classifier on
# @param pcs        Vector of PCs on which to base genes to train classifier on.
#                   Pulls top num.genes genes associated with these PCs
# @param num.genes  Number of genes to pull for each PC
# @return           Returns the accuracy of the classifier after CV

RunClassifier <- function(object, group1, group2, pcs, num.genes) {
  d1 <- WhichCells(object = object, ident = group1)
  d2 <- WhichCells(object = object, ident = group2)
  # Binary labels: 0 for group1 cells, 1 for group2 cells.
  y <- as.numeric(x = object@ident[c(d1, d2)]) - 1
  x <- data.frame(as.matrix(t(
    x = object@data[PCTopGenes(object = object, pc.use = pcs, num.genes =
      num.genes), c(d1, d2)]
  )))
  # Drop zero-variance genes, which the SVM cannot use.
  xv <- apply(X = x, MARGIN = 2, FUN = var)
  x <- x[, names(x = which(xv > 0))]
  # run k-fold cross validation
  ctrl <- trainControl(method = "repeatedcv", repeats = 5)
  set.seed(seed = 1500)
  # NOTE: a spurious `formula =` argument was previously passed alongside the
  # x/y interface; train.default has no such parameter (it was silently
  # absorbed by `...`), so it has been removed.
  model <- train(
    x = x,
    y = as.factor(x = y),
    method = "svmLinear",
    trControl = ctrl
  )
  acc <- model$results[, 2]
  return(acc)
}

#' Assess Internal Nodes
#'
#' Method for automating assessment of tree splits over all internal nodes,
#' or a provided list of internal nodes. Uses AssessSplit() for calculation
#' of Out of Bag error (proxy for confidence in split).
#'
#' @param object Seurat object
#' @param node.list List of internal nodes to assess and return
#' @param all.below If single node provided in node.list, assess all splits below (and including)
#' provided node
#' .
#' @return Returns the Out of Bag error for a random forest classifiers trained on
#' each internal node split or each split provided in the node list.
#'
#' @export
AssessNodes <- function(object, node.list, all.below = FALSE) {
  tree <- object@cluster.tree[[1]]
  if (missing(x = node.list)) {
    node.list <- GetAllInternalNodes(tree = tree)
  } else {
    possible.nodes <- GetAllInternalNodes(tree = tree)
    if (any(! node.list %in% possible.nodes)) {
      stop(paste(
        node.list[! (node.list %in% possible.nodes)],
        "not valid internal nodes"
      ))
    }
    # BUG FIX: this previously read `length(x = node.list == 1)`, which is
    # the length of a logical vector (always >= 1, hence always truthy), so
    # all.below expanded the list even when several nodes were supplied.
    if (length(x = node.list) == 1 && all.below) {
      node.list <- c(node.list, DFT(tree = tree, node = node.list))
    }
  }
  oobe <- pbsapply(
    X = node.list,
    FUN = function(x) {
      return(AssessSplit(
        object = object,
        node = x,
        print.output = FALSE,
        verbose = FALSE
      ))
    }
  )
  return(data.frame(node = node.list, oobe))
}

#' Assess Cluster Split
#'
#' Method for determining confidence in specific bifurcations in
#' the cluster tree. Use the Out of Bag (OOB) error of a random
#' forest classifier to judge confidence.
#'
#' @param object Seurat object
#' @param node Node in the cluster tree in question
#' @param cluster1 First cluster to compare
#' @param cluster2 Second cluster to compare
#' @param print.output Print the OOB error for the classifier
#' @inheritDotParams BuildRFClassifier -object
#' @return Returns the Out of Bag error for a random forest classifier
#' trained on the split from the given node
#' @export
AssessSplit <- function(
  object,
  node,
  cluster1,
  cluster2,
  print.output = TRUE,
  ...
) {
  tree <- object@cluster.tree[[1]]
  if (! missing(x = node)) {
    if (! missing(x = cluster1) || ! missing(x = cluster2)) {
      warning("Both node and cluster IDs provided. Defaulting to using node ID")
    }
    # Valid nodes: the root (first entry of the edge matrix) plus everything
    # reachable from it.
    possible.nodes <- c(
      DFT(tree = tree, node = tree$edge[, 1][1]),
      tree$edge[, 1][1]
    )
    if (! node %in% possible.nodes) {
      stop("Not a valid node")
    }
    split <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
    group1 <- DFT(tree = tree, node = split[1], only.children = TRUE)
    group2 <- DFT(tree = tree, node = split[2], only.children = TRUE)
    # DFT returns NA for a leaf; in that case the child itself is the group.
    if (any(is.na(x = group1))) {
      group1 <- split[1]
    }
    if (any(is.na(x = group2))) {
      group2 <- split[2]
    }
  } else {
    group1 <- cluster1
    group2 <- cluster2
  }
  group1.cells <- WhichCells(object = object, ident = group1)
  group2.cells <- WhichCells(object = object, ident = group2)
  assess.data <- SubsetData(
    object = object,
    cells.use = c(group1.cells, group2.cells)
  )
  # Relabel the two sides of the split and train a two-class RF on them.
  assess.data <- SetIdent(
    object = assess.data,
    cells.use = group1.cells,
    ident.use = "g1"
  )
  assess.data <- SetIdent(
    object = assess.data,
    cells.use = group2.cells,
    ident.use = "g2"
  )
  rfc <- BuildRFClassifier(
    object = assess.data,
    training.genes = assess.data@var.genes,
    training.classes = assess.data@ident,
    ...
  )
  oobe <- rfc$prediction.error
  if (print.output) {
    print(paste0("Out of Bag Error: ", round(x = oobe, digits = 4) * 100, "%"))
  }
  return(oobe)
}
diff --git a/R/deprecated_functions.R b/R/deprecated_functions.R
new file mode 100644
index 000000000..5da80aeec
--- /dev/null
+++ b/R/deprecated_functions.R
@@ -0,0 +1,573 @@
#' Deprecated function(s) in the Seurat package
#'
#' These functions are provided for compatibility with older versions of the Seurat package. They may eventually be completely removed.
#' @rdname Seurat-deprecated
#' @name Seurat-deprecated
#' @param ...
Parameters to be passed to the modern version of the function +#' @export vlnPlot subsetData mean.var.plot pca PCA project.pca print.pca viz.pca set.ident pca.plot pcHeatmap jackStraw jackStrawPlot run_tsne tsne.plot find.markers find_all_markers genePlot feature.plot tsne.plot buildClusterTree plotClusterTree plotNoiseModel add_samples subsetCells project.samples run_diffusion ica ICA cluster.alpha reorder.ident average.pca average.expression icTopGenes pcTopGenes pcTopCells fetch.data viz.ica regulatorScore find.markers.node diffExp.test tobit.test batch.gene marker.test diff.t.test which.cells set.all.ident rename.ident posterior.plot map.cell get.centroids refined.mapping initial.mapping calc.insitu fit.gene.k fit.gene.mix addSmoothedScore addImputedScore getNewScore calcNoiseModels feature.plot.keynote feature.heatmap ica.plot dim.plot spatial.de DBclust_dimension Kclust_dimension pca.sig.genes doHeatMap icHeatmap doKMeans genes.in.cluster kMeansHeatmap cell.cor.matrix gene.cor.matrix calinskiPlot dot.plot addMetaData removePC geneScorePlot cellPlot jackStraw.permutation.test jackStrawMC jackStrawFull writ.table jackRandom MeanVarPlot HeatmapNode minusr minusc +#' @aliases vlnPlot subsetData mean.var.plot pca PCA project.pca print.pca viz.pca set.ident pca.plot pcHeatmap jackStraw jackStrawPlot run_tsne tsne.plot find.markers find_all_markers genePlot feature.plot tnse.plot buildClusterTree plotClusterTree plotNoiseModel add_samples subsetCells project.samples run_diffusion ica ICA cluster.alpha reorder.ident average.pca average.expression icTopGenes pcTopGenes pcTopCells fetch.data viz.ica regulatorScore find.markers.node diffExp.test tobit.test batch.gene marker.test diff.t.test which.cells set.all.ident rename.ident posterior.plot map.cell get.centroids refined.mapping initial.mapping calc.insitu fit.gene.k fit.gene.mix addSmoothedScore addImputedScore getNewScore calcNoiseModels feature.plot.keynote feature.heatmap ica.plot dim.plot spatial.de 
DBclust_dimension Kclust_dimension pca.sig.genes doHeatMap icHeatmap doKMeans genes.in.cluster kMeansHeatmap cell.cor.matrix gene.cor.matrix calinskiPlot dot.plot addMetaData removePC geneScorePlot cellPlot jackStraw.permutation.test jackStrawMC jackStrawFull writ.table jackRandom MeanVarPlot HeatmapNode minusr minusc +#' @section Details: +#' \tabular{rl}{ +#' \code{vlnPlot} \tab now a synonym for \code{\link{VlnPlot}}\cr +#' \code{subsetData} \tab now a synonym for \code{\link{SubsetData}}\cr +#' \code{mean.var.plot} \tab now a synonym for \code{\link{MeanVarPlot}}\cr +#' \code{pca} \tab now a synonym for \code{\link{RunPCA}}\cr +#' \code{PCA} \tab now a synonym for \code{\link{PCA}}\cr +#' \code{project.pca} \tab now a synonym for \code{\link{ProjectPCA}}\cr +#' \code{print.pca} \tab now a synonym for \code{\link{PrintPCA}}\cr +#' \code{viz.pca} \tab now a synonym for \code{\link{VizPCA}}\cr +#' \code{set.ident} \tab now a synonym for \code{\link{SetIdent}}\cr +#' \code{pca.plot} \tab now a synonym for \code{\link{PCAPlot}}\cr +#' \code{pcHeatmap} \tab now a synonym for \code{\link{PCHeatmap}}\cr +#' \code{jackStraw} \tab now a synonym for \code{\link{JackStraw}}\cr +#' \code{jackStrawPlot} \tab now a synonym for \code{\link{JackStrawPlot}}\cr +#' \code{run_tsne} \tab now a synonym for \code{\link{RunTSNE}}\cr +#' \code{tsne.plot} \tab now a synonym for \code{\link{TSNEPlot}}\cr +#' \code{find.markers} \tab now a synonym for \code{\link{FindMarkers}}\cr +#' \code{find_all_markers} \tab now a synonym for \code{\link{FindAllMarkers}}\cr +#' \code{genePlot} \tab now a synonym for \code{\link{GenePlot}}\cr +#' \code{feature.plot} \tab now a synonym for \code{\link{FeaturePlot}}\cr +#' \code{buildClusterTree} \tab now a synonym for \code{\link{BuildClusterTree}}\cr +#' \code{plotClusterTree} \tab now a synonym for \code{\link{PlotClusterTree}}\cr +#' \code{plotNoiseModel} \tab now a synonym for \code{\link{PlotNoiseModel}}\cr +#' \code{add_samples} \tab now a synonym 
for \code{\link{AddSamples}}\cr +#' \code{subsetCells} \tab now deleted\cr +#' \code{project.samples} \tab now a synonym for \code{\link{ProjectSamples}}\cr +#' \code{run_diffusion} \tab now a synonym for \code{\link{RunDiffusion}}\cr +#' \code{ica} \tab now a synonym for \code{\link{RunICA}}\cr +#' \code{ICA} \tab now a synonym for \code{\link{RunICA}}\cr +#' \code{cluster.alpha} \tab now a synonym for \code{\link{AverageDetectionRate}}\cr +#' \code{reorder.ident} \tab now a synonym for \code{\link{ReorderIdent}}\cr +#' \code{average.pca} \tab now a synonym for \code{\link{AveragePCA}}\cr +#' \code{average.expression} \tab now a synonym for \code{\link{AverageExpression}}\cr +#' \code{icTopGenes} \tab now a synonym for \code{\link{ICTopGenes}}\cr +#' \code{pcTopGenes} \tab now a synonym for \code{\link{PCTopGenes}}\cr +#' \code{pcTopCells} \tab now a synonym for \code{\link{PCTopCells}}\cr +#' \code{fetch.data} \tab now a synonym for \code{\link{FetchData}}\cr +#' \code{viz.ica} \tab now a synonym for \code{\link{VizIca}}\cr +#' \code{regulatorScore} \tab now deleted\cr +#' \code{find.markers.node} \tab now a synonym for \code{\link{FindMarkersNode}}\cr +#' \code{diffExp.test} \tab now a synonym for \code{\link{DiffExpTest}}\cr +#' \code{tobit.test} \tab now a synonym for \code{\link{TobitTest}}\cr +#' \code{batch.gene} \tab now a synonym for \code{\link{BatchGene}}\cr +#' \code{marker.test} \tab now a synonym for \code{\link{MarkerTest}}\cr +#' \code{diff.t.test} \tab now a synonym for \code{\link{DiffTTest}}\cr +#' \code{which.cells} \tab now a synonym for \code{\link{WhichCells}}\cr +#' \code{set.all.ident} \tab now a synonym for \code{\link{SetAllIdent}}\cr +#' \code{rename.ident} \tab now a synonym for \code{\link{RenameIdent}}\cr +#' \code{posterior.plot} \tab now a synonym for \code{\link{PosteriorPlot}}\cr +#' \code{map.cell} \tab now a synonym for \code{\link{MapCell}}\cr +#' \code{get.centroids} \tab now a synonym for \code{\link{GetCentroids}}\cr +#' 
\code{refined.mapping} \tab now a synonym for \code{\link{RefinedMapping}}\cr +#' \code{initial.mapping} \tab now a synonym for \code{\link{InitialMapping}}\cr +#' \code{calc.insitu} \tab now a synonym for \code{\link{CalcInsitu}}\cr +#' \code{fit.gene.k} \tab now a synonym for \code{\link{FitGeneK}}\cr +#' \code{fit.gene.mix} \tab now a synonym for \code{\link{FitGeneMix}}\cr +#' \code{addSmoothedScore} \tab now a synonym for \code{\link{AddSmoothedScore}}\cr +#' \code{addImputedScore} \tab now a synonym for \code{\link{AddImputedScore}}\cr +#' \code{getNewScore} \tab now a synonym for \code{\link{GetNewScore}}\cr +#' \code{calcNoiseModels} \tab now a synonym for \code{\link{CalcNoiseModels}}\cr +#' \code{feature.plot.keynote} \tab now a synonym for \code{\link{FeaturePlotKeynote}}\cr +#' \code{feature.heatmap} \tab now a synonym for \code{\link{FeatureHeatmap}}\cr +#' \code{ica.plot} \tab now a synonym for \code{\link{ICAPlot}}\cr +#' \code{dim.plot} \tab now a synonym for \code{\link{DimPlot}}\cr +#' \code{spatial.de} \tab now a synonym for \code{\link{SpatialDe}}\cr +#' \code{DBclust_dimension} \tab now a synonym for \code{\link{DBClustDimension}}\cr +#' \code{Kclust_dimension} \tab now a synonym for \code{\link{KClustDimension}}\cr +#' \code{pca.sig.genes} \tab now a synonym for \code{\link{PCASigGenes}}\cr +#' \code{doHeatMap} \tab now a synonym for \code{\link{DoHeatMap}}\cr +#' \code{icHeatmap} \tab now a synonym for \code{\link{ICHeatmap}}\cr +#' \code{doKMeans} \tab now a synonym for \code{\link{DoKMeans}}\cr +#' \code{genes.in.cluster} \tab now a synonym for \code{\link{GenesInCluster}}\cr +#' \code{kMeansHeatmap} \tab now a synonym for \code{\link{KMeansHeatmap}}\cr +#' \code{cell.cor.matrix} \tab now a synonym for \code{\link{CellCorMatrix}}\cr +#' \code{gene.cor.matrix} \tab now a synonym for \code{\link{GeneCorMatrix}}\cr +#' \code{calinskiPlot} \tab now a synonym for \code{\link{CalinskiPlot}}\cr +#' \code{dot.plot} \tab now a synonym for 
\code{\link{DotPlot}}\cr +#' \code{addMetaData} \tab now a synonym for \code{\link{AddMetaData}}\cr +#' \code{removePC} \tab now a synonym for \code{\link{RemovePC}}\cr +#' \code{geneScorePlot} \tab now deleted\cr +#' \code{cellPlot} \tab now a synonym for \code{\link{CellPlot}}\cr +#' \code{jackStraw.permutation.test} \tab now a synonym for \code{\link{JackStrawPermutationTest}}\cr +#' \code{jackStrawMC} \tab now a synonym for \code{\link{JackStrawMC}}\cr +#' \code{jackStrawFull} \tab now a synonym for \code{\link{JackStrawFull}}\cr +#' \code{PCAFast} \tab now a synonym for \code{\link{PCA}}\cr +#' \code{writ.table} \tab is delteded without replacement\cr +#' \code{jackRandom} \tab now a synonym for \code{\link{JackRandom}}\cr +#' \code{MeanVarPlot} \tab now a synonym for \code{\link{FindVariableGenes}}\cr +#' \code{myPalette} \tab now a synonym for \code{\link{CustomPalette}}\cr +#' \code{minusr} \tab now a synonym for \code{\link{SubsetRow}}\cr +#' \code{minusc} \tab now a synonym for \code{\link{SubsetColumn}}\cr +#' } +#' +vlnPlot <- function(...) { + .Deprecated("VlnPlot", package="Seurat") + VlnPlot(...) +} + +subsetData <- function(...) { + .Deprecated("SubsetData", package="Seurat") + SubsetData(...) +} + +mean.var.plot <- function(...) { + .Deprecated("MeanVarPlot", package="Seurat") + MeanVarPlot(...) +} + +pca <- function(...) { + .Deprecated("RunPCA", package="Seurat") + RunPCA(...) +} + +PCA <- function(...) { + .Deprecated("RunPCA", package="Seurat") + RunPCA(...) +} + +project.pca <- function(...) { + .Deprecated("ProjectPCA", package="Seurat") + ProjectPCA(...) +} + +print.pca <- function(...) { + .Deprecated("PrintPCA", package="Seurat") + PrintPCA(...) +} + +viz.pca <- function(...) { + .Deprecated("VizPCA", package="Seurat") + VizPCA(...) +} + +set.ident <- function(...) { + .Deprecated("SetIdent", package="Seurat") + SetIdent(...) +} + +pca.plot <- function(...) { + .Deprecated("PCAPlot", package="Seurat") + PCAPlot(...) 
+} + +pcHeatmap <- function(...) { + .Deprecated("PCHeatmap", package="Seurat") + PCHeatmap(...) +} + +jackStraw <- function(...) { + .Deprecated("JackStraw", package="Seurat") + JackStraw(...) +} + +jackStrawPlot <- function(...) { + .Deprecated("JackStrawPlot", package="Seurat") + JackStrawPlot(...) +} + +run_tsne <- function(...) { + .Deprecated("RunTSNE", package="Seurat") + RunTSNE(...) +} + +tsne.plot <- function(...) { + .Deprecated("TSNEPlot", package="Seurat") + TSNEPlot(...) +} + +find.markers <- function(...) { + .Deprecated("FindMarkers", package="Seurat") + FindMarkers(...) +} + +find_all_markers <- function(...) { + .Deprecated("FindAllMarkers", package="Seurat") + FindAllMarkers(...) +} + +genePlot <- function(...) { + .Deprecated("GenePlot", package="Seurat") + GenePlot(...) +} + +feature.plot <- function(...) { + .Deprecated("FeaturePlot", package="Seurat") + FeaturePlot(...) +} + +buildClusterTree <- function(...) { + .Deprecated("BuildClusterTree", package="Seurat") + BuildClusterTree(...) +} + +plotClusterTree <- function(...) { + .Deprecated("PlotClusterTree", package="Seurat") + PlotClusterTree(...) +} + +plotNoiseModel <- function(...) { + .Deprecated("PlotNoiseModel", package="Seurat") + PlotNoiseModel(...) +} + +add_samples <- function(...) { + .Deprecated("AddSamples", package="Seurat") + AddSamples(...) +} + +subsetCells <- function(...) { + .Deprecated( + package = "Seurat", + msg = 'subsetCells is now deleted, please use SubsetData' + ) +} + +project.samples <- function(...) { + .Deprecated("ProjectSamples", package="Seurat") + ProjectSamples(...) +} + +run_diffusion <- function(...) { + .Deprecated("RunDiffusion", package="Seurat") + RunDiffusion(...) +} + +ica <- function(...) { + .Deprecated("RunICA", package="Seurat") + RunICA(...) +} + +ICA <- function(...) { + .Deprecated("RunICA", package="Seurat") + RunICA(...) +} + +cluster.alpha <- function(...) 
{
  .Deprecated("AverageDetectionRate", package = "Seurat")
  AverageDetectionRate(...)
}

# Deprecated aliases: each wrapper emits a deprecation warning naming its
# replacement, then forwards all arguments to it unchanged.
# FIX: several replacement names previously contained trailing spaces
# (e.g. "MarkerTest "), so .Deprecated() advertised a non-existent function.

reorder.ident <- function(...) {
  .Deprecated("ReorderIdent", package = "Seurat")
  ReorderIdent(...)
}

average.pca <- function(...) {
  .Deprecated("AveragePCA", package = "Seurat")
  AveragePCA(...)
}

average.expression <- function(...) {
  .Deprecated("AverageExpression", package = "Seurat")
  AverageExpression(...)
}

icTopGenes <- function(...) {
  .Deprecated("ICTopGenes", package = "Seurat")
  ICTopGenes(...)
}

pcTopGenes <- function(...) {
  .Deprecated("PCTopGenes", package = "Seurat")
  PCTopGenes(...)
}

pcTopCells <- function(...) {
  .Deprecated("PCTopCells", package = "Seurat")
  PCTopCells(...)
}

fetch.data <- function(...) {
  .Deprecated("FetchData", package = "Seurat")
  FetchData(...)
}

viz.ica <- function(...) {
  .Deprecated("VizICA", package = "Seurat")
  VizICA(...)
}

regulatorScore <- function(...) {
  .Deprecated(
    package = "Seurat",
    msg = 'regulatorScore has been deleted without replacement'
  )
}

find.markers.node <- function(...) {
  .Deprecated("FindMarkersNode", package = "Seurat")
  FindMarkersNode(...)
}

diffExp.test <- function(...) {
  .Deprecated("DiffExpTest", package = "Seurat")
  DiffExpTest(...)
}

tobit.test <- function(...) {
  .Deprecated("TobitTest", package = "Seurat")
  TobitTest(...)
}

batch.gene <- function(...) {
  .Deprecated("BatchGene", package = "Seurat")
  BatchGene(...)
}

marker.test <- function(...) {
  .Deprecated("MarkerTest", package = "Seurat")
  MarkerTest(...)
}

diff.t.test <- function(...) {
  .Deprecated("DiffTTest", package = "Seurat")
  DiffTTest(...)
}

which.cells <- function(...) {
  .Deprecated("WhichCells", package = "Seurat")
  WhichCells(...)
}

set.all.ident <- function(...) {
  .Deprecated("SetAllIdent", package = "Seurat")
  SetAllIdent(...)
}

rename.ident <- function(...) {
  .Deprecated("RenameIdent", package = "Seurat")
  RenameIdent(...)
}

posterior.plot <- function(...) {
  .Deprecated("PosteriorPlot", package = "Seurat")
  PosteriorPlot(...)
}

map.cell <- function(...) {
  .Deprecated("MapCell", package = "Seurat")
  MapCell(...)
}

get.centroids <- function(...) {
  .Deprecated("GetCentroids", package = "Seurat")
  GetCentroids(...)
}

refined.mapping <- function(...) {
  .Deprecated("RefinedMapping", package = "Seurat")
  RefinedMapping(...)
}

initial.mapping <- function(...) {
  .Deprecated("InitialMapping", package = "Seurat")
  InitialMapping(...)
}

calc.insitu <- function(...) {
  .Deprecated("CalcInsitu", package = "Seurat")
  CalcInsitu(...)
}

fit.gene.k <- function(...) {
  .Deprecated("FitGeneK", package = "Seurat")
  FitGeneK(...)
}

fit.gene.mix <- function(...) {
  .Deprecated("FitGeneMix", package = "Seurat")
  FitGeneMix(...)
}

addSmoothedScore <- function(...) {
  .Deprecated("AddSmoothedScore", package = "Seurat")
  AddSmoothedScore(...)
}

addImputedScore <- function(...) {
  .Deprecated("AddImputedScore", package = "Seurat")
  AddImputedScore(...)
}

getNewScore <- function(...) {
  .Deprecated("GetNewScore", package = "Seurat")
  GetNewScore(...)
}

calcNoiseModels <- function(...) {
  .Deprecated("CalcNoiseModels", package = "Seurat")
  CalcNoiseModels(...)
}

feature.plot.keynote <- function(...) {
  .Deprecated("FeaturePlotKeynote", package = "Seurat")
  FeaturePlotKeynote(...)
}

feature.heatmap <- function(...) {
  .Deprecated("FeatureHeatmap", package = "Seurat")
  FeatureHeatmap(...)
}

ica.plot <- function(...) {
  .Deprecated("ICAPlot", package = "Seurat")
  ICAPlot(...)
}

dim.plot <- function(...) {
  .Deprecated("DimPlot", package = "Seurat")
  DimPlot(...)
}

spatial.de <- function(...) {
  .Deprecated("SpatialDe", package = "Seurat")
  SpatialDe(...)
}

DBclust_dimension <- function(...) {
  .Deprecated("DBClustDimension", package = "Seurat")
  DBClustDimension(...)
}

Kclust_dimension <- function(...) {
  .Deprecated("KClustDimension", package = "Seurat")
  KClustDimension(...)
}

pca.sig.genes <- function(...) {
  .Deprecated("PCASigGenes", package = "Seurat")
  PCASigGenes(...)
}

doHeatMap <- function(...) {
  .Deprecated("DoHeatmap", package = "Seurat")
  DoHeatmap(...)
}

icHeatmap <- function(...) {
  .Deprecated("ICHeatmap", package = "Seurat")
  ICHeatmap(...)
}

doKMeans <- function(...) {
  .Deprecated("DoKMeans", package = "Seurat")
  DoKMeans(...)
}

genes.in.cluster <- function(...) {
  .Deprecated("GenesInCluster", package = "Seurat")
  GenesInCluster(...)
}

kMeansHeatmap <- function(...) {
  .Deprecated("KMeansHeatmap", package = "Seurat")
  KMeansHeatmap(...)
}

cell.cor.matrix <- function(...) {
  .Deprecated("CellCorMatrix", package = "Seurat")
  CellCorMatrix(...)
}

gene.cor.matrix <- function(...) {
  .Deprecated("GeneCorMatrix", package = "Seurat")
  GeneCorMatrix(...)
}

calinskiPlot <- function(...) {
  .Deprecated("CalinskiPlot", package = "Seurat")
  CalinskiPlot(...)
}

dot.plot <- function(...) {
  .Deprecated("DotPlot", package = "Seurat")
  DotPlot(...)
}

addMetaData <- function(...) {
  .Deprecated("AddMetaData", package = "Seurat")
  AddMetaData(...)
}

removePC <- function(...) {
  .Deprecated("RemovePC", package = "Seurat")
  RemovePC(...)
}

geneScorePlot <- function(...) {
  .Deprecated(
    package = "Seurat",
    msg = 'geneScorePlot has been removed without replacement'
  )
}

cellPlot <- function(...) {
  .Deprecated("CellPlot", package = "Seurat")
  CellPlot(...)
}

jackStraw.permutation.test <- function(...) {
  .Deprecated("JackStrawPermutationTest", package = "Seurat")
  JackStrawPermutationTest(...)
}

jackStrawMC <- function(...) {
  .Deprecated("JackStrawMC", package = "Seurat")
  JackStrawMC(...)
}

jackStrawFull <- function(...) {
  .Deprecated("JackStrawFull", package = "Seurat")
  JackStrawFull(...)
}

PCAFast <- function(...)
{
  .Deprecated("PCA", package = "Seurat")
  PCA(...)
}

writ.table <- function(...) {
  .Deprecated(
    new = 'write.table',
    package = 'Seurat',
    msg = "'writ.table' no longer exists, use 'write.table' instead"
  )
}

jackRandom <- function(...) {
  .Deprecated(new = 'JackRandom', package = 'Seurat')
  JackRandom(...)
}

MeanVarPlot <- function(...) {
  .Deprecated(new = 'FindVariableGenes', package = 'Seurat')
  FindVariableGenes(...)
}

HeatmapNode <- function(...) {
  .Deprecated(new = 'NodeHeatmap', package = 'Seurat')
  NodeHeatmap(...)
}

myPalette <- function(...) {
  .Deprecated(new = 'CustomPalette', package = 'Seurat')
  CustomPalette(...)
}

minusr <- function(...) {
  .Deprecated(
    new = 'SubsetRow',
    package = 'Seurat',
    msg = "Use SubsetRow with 'invert = TRUE' instead"
  )
  SubsetRow(..., invert = TRUE)
}

minusc <- function(...) {
  .Deprecated(
    new = 'SubsetColumn',
    package = 'Seurat',
    msg = "Use SubsetColumn with 'invert = TRUE' instead"
  )
  SubsetColumn(..., invert = TRUE)
}

# ---- R/differential_expression.R ----

#' Gene expression markers of identity classes
#'
#' Finds markers (differentially expressed genes) for identity classes
#'
#' @param object Seurat object
#' @param ident.1 Identity class to define markers for. May also be a vector of
#' cell names (members of object@@cell.names) to use directly as group 1.
#' @param ident.2 A second identity class for comparison. If NULL (default) -
#' use all other cells for comparison.
#' @param genes.use Genes to test. Default is to use all genes
#' @param thresh.use Limit testing to genes which show, on average, at least
#' X-fold difference (log-scale) between the two groups of cells. Default is 0.25
#' Increasing thresh.use speeds up the function, but can miss weaker signals.
#' @param test.use Denotes which test to use. Seurat currently implements
#' "bimod" (likelihood-ratio test for single cell gene expression, McDavid et
#' al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t"
#' (Student's t-test), and "tobit" (Tobit-test for differential gene expression,
#' as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'.
#' The latter two options should only be used on UMI datasets, and assume an underlying
#' poisson or negative-binomial distribution
#' @param min.pct - only test genes that are detected in a minimum fraction of min.pct cells
#' in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed. Default is 0.1
#' @param min.diff.pct - only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default
#' @param only.pos Only return positive markers (FALSE by default)
#' @param print.bar Print a progress bar once expression testing begins (uses pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number. Default is no downsampling. Not activated by default (set to Inf)
#' @param random.seed Random seed for downsampling
#' @param latent.vars Latent variables (fetched with FetchData) to regress out;
#' used only by the 'negbinom' and 'poisson' tests. Default is "nUMI".
#' @param min.cells Minimum number of cells expressing the gene in at least one of the two groups
#'
#' @return Matrix containing a ranked list of putative markers, and associated statistics (p-values, ROC score, etc.)
#'
#' @import VGAM
#' @import pbapply
#'
#' @export
#'
FindMarkers <- function(
  object,
  ident.1,
  ident.2 = NULL,
  genes.use = NULL,
  thresh.use = 0.25,
  test.use = "bimod",
  min.pct = 0.1,
  min.diff.pct = -Inf,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  random.seed = 1,
  latent.vars = "nUMI",
  min.cells = 3
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  if (max.cells.per.ident < Inf) {
    object <- SubsetData(
      object = object,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed
    )
  }
  # in case the user passed in cells instead of identity classes
  # FIX: the '> 1' belongs outside length(); the original computed
  # length(as.vector(ident.1) > 1), i.e. the length of a logical vector.
  if (length(x = as.vector(x = ident.1)) > 1 &&
      any(as.character(x = ident.1) %in% object@cell.names)) {
    cells.1 <- intersect(x = ident.1, y = object@cell.names)
  } else {
    cells.1 <- WhichCells(object = object, ident = ident.1)
  }
  # if NULL for ident.2, use all other cells
  if (length(x = as.vector(x = ident.2)) > 1 &&
      any(as.character(x = ident.2) %in% object@cell.names)) {
    cells.2 <- intersect(x = ident.2, y = object@cell.names)
  } else {
    if (is.null(x = ident.2)) {
      cells.2 <- object@cell.names
    } else {
      cells.2 <- WhichCells(object = object, ident = ident.2)
    }
  }
  cells.2 <- setdiff(x = cells.2, y = cells.1)
  # error checking
  if (length(x = cells.1) == 0) {
    print(paste("Cell group 1 is empty - no cells with identity class", ident.1))
    return(NULL)
  }
  if (length(x = cells.2) == 0) {
    print(paste("Cell group 2 is empty - no cells with identity class", ident.2))
    return(NULL)
  }
  # FIX: hoisted the min.cells guards above the expensive per-gene scans;
  # the original only failed after computing detection percentages.
  if (length(x = cells.1) < min.cells) {
    stop(paste("Cell group 1 has fewer than", as.character(min.cells), "cells in identity class", ident.1))
  }
  if (length(x = cells.2) < min.cells) {
    stop(paste("Cell group 2 has fewer than", as.character(min.cells), " cells in identity class", ident.2))
  }
  # gene selection (based on percent expressed)
  thresh.min <- 0
  data.temp1 <- round(
    x = apply(
      X = object@data[genes.use, cells.1, drop = FALSE],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min) / length(x = x))
      }
    ),
    digits = 3
  )
  data.temp2 <- round(
    x = apply(
      X = object@data[genes.use, cells.2, drop = FALSE],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min) / length(x = x))
      }
    ),
    digits = 3
  )
  data.alpha <- cbind(data.temp1, data.temp2)
  colnames(x = data.alpha) <- c("pct.1", "pct.2")
  # keep genes detected above min.pct in at least one group, whose detection
  # rates also differ by more than min.diff.pct
  alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = max)
  names(x = alpha.min) <- rownames(x = data.alpha)
  alpha.diff <- alpha.min - apply(X = data.alpha, MARGIN = 1, FUN = min)
  # FIX: removed a redundant genes.use assignment (alpha.min > min.pct alone)
  # that was immediately overwritten by this combined filter.
  genes.use <- names(
    x = which(x = alpha.min > min.pct & alpha.diff > min.diff.pct)
  )
  # gene selection (based on average difference)
  data.1 <- apply(X = object@data[genes.use, cells.1, drop = FALSE], MARGIN = 1, FUN = ExpMean)
  data.2 <- apply(X = object@data[genes.use, cells.2, drop = FALSE], MARGIN = 1, FUN = ExpMean)
  total.diff <- (data.1 - data.2)
  genes.diff <- names(x = which(x = abs(x = total.diff) > thresh.use))
  genes.use <- intersect(x = genes.use, y = genes.diff)
  # perform the requested DE test
  if (test.use == "bimod") {
    to.return <- DiffExpTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar
    )
  }
  if (test.use == "roc") {
    to.return <- MarkerTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar
    )
  }
  if (test.use == "t") {
    to.return <- DiffTTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar
    )
  }
  if (test.use == "tobit") {
    to.return <- TobitTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar
    )
  }
  if (test.use == "negbinom") {
    to.return <- NegBinomDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      latent.vars = latent.vars,
      print.bar = print.bar,
      min.cells = min.cells
    )
  }
  if (test.use == "poisson") {
    to.return <- PoissonDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      latent.vars = latent.vars,
      print.bar = print.bar,
      # FIX: was passed positionally past the last parameter (an error);
      # PoissonDETest reads min.cells, so pass it by name.
      min.cells = min.cells
    )
  }
  # attach fold changes and detection percentages, then rank the results
  to.return[, "avg_diff"] <- total.diff[rownames(x = to.return)]
  to.return <- cbind(to.return, data.alpha[rownames(x = to.return), ])
  if (test.use == "roc") {
    to.return <- to.return[order(-to.return$power, -to.return$avg_diff), ]
  } else {
    to.return <- to.return[order(to.return$p_val, -to.return$avg_diff), ]
  }
  if (only.pos) {
    to.return <- subset(x = to.return, subset = avg_diff > 0)
  }
  return(to.return)
}

#' Gene expression markers for all identity classes
#'
#' Finds markers (differentially expressed genes) for each of the identity classes in a dataset
#'
#' @param object Seurat object
#' @param genes.use Genes to test. Default is to all genes
#' @param thresh.use Limit testing to genes which show, on average, at least
#' X-fold difference (log-scale) between the two groups of cells.
#' Increasing thresh.use speeds up the function, but can miss weaker signals.
#' @param test.use Denotes which test to use. Seurat currently implements
#' "bimod" (likelihood-ratio test for single cell gene expression, McDavid et
#' al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t"
#' (Students t-test), and "tobit" (Tobit-test for differential gene expression,
#' as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'.
#' The latter two options should only be used on UMI datasets, and assume an underlying
#' poisson or negative-binomial distribution
#' @param min.pct - only test genes that are detected in a minimum fraction of min.pct cells
#' in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed
#' @param min.diff.pct - only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default
#' @param only.pos Only return positive markers (FALSE by default)
#' @param print.bar Print a progress bar once expression testing begins (uses pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number. Default is no downsampling.
#' @param random.seed Random seed for downsampling
#' @param return.thresh Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)
#' @param do.print FALSE by default. If TRUE, outputs updates on progress.
#' @param min.cells Minimum number of cells expressing the gene in at least one of the two groups
#' @param latent.vars remove the effects of these variables
#'
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, ROC score, etc.)
#'
#' @export
#'
FindAllMarkers <- function(
  object,
  genes.use = NULL,
  thresh.use = 0.25,
  test.use = "bimod",
  min.pct = 0.1,
  min.diff.pct = 0.05,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  return.thresh = 1e-2,
  do.print = FALSE,
  random.seed = 1,
  min.cells = 3,
  latent.vars = "nUMI"
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # ROC power is on a different scale, so the default p-value cutoff is
  # reinterpreted as a power cutoff
  if ((test.use == "roc") && (return.thresh == 1e-2)) {
    return.thresh <- 0.7
  }
  idents.all <- sort(x = unique(x = object@ident))
  genes.de <- list()
  if (max.cells.per.ident < Inf) {
    object <- SubsetData(
      object = object,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed
    )
  }
  # one-vs-rest test per identity class
  for (i in seq_along(idents.all)) {
    # FIX: progress message now printed before the (slow) computation it
    # announces, not after
    if (do.print) {
      print(paste("Calculating cluster", idents.all[i]))
    }
    genes.de[[i]] <- FindMarkers(
      object = object,
      ident.1 = idents.all[i],
      ident.2 = NULL,
      genes.use = genes.use,
      thresh.use = thresh.use,
      test.use = test.use,
      min.pct = min.pct,
      min.diff.pct = min.diff.pct,
      print.bar = print.bar,
      min.cells = min.cells,
      latent.vars = latent.vars
    )
  }
  # assemble per-cluster results, keeping only markers passing return.thresh
  gde.all <- data.frame()
  for (i in seq_along(idents.all)) {
    gde <- genes.de[[i]]
    if (is.null(unlist(gde))) {
      next
    }
    if (nrow(x = gde) > 0) {
      if (test.use == "roc") {
        gde <- subset(
          x = gde,
          subset = (myAUC > return.thresh | myAUC < (1 - return.thresh))
        )
      } else {
        gde <- gde[order(gde$p_val, -gde$avg_diff), ]
        gde <- subset(x = gde, subset = p_val < return.thresh)
      }
      if (nrow(x = gde) > 0) {
        gde$cluster <- idents.all[i]
        gde$gene <- rownames(x = gde)
        gde.all <- rbind(gde.all, gde)
      }
    }
  }
  if (only.pos) {
    return(subset(x = gde.all, subset = avg_diff > 0))
  }
  return(gde.all)
}

#' Gene expression markers of identity classes defined by a phylogenetic clade
#'
#' Finds markers (differentially expressed genes) based on a branching point (node) in
#' the phylogenetic tree. Markers that define clusters in the left branch are positive markers.
#' Markers that define the right branch are negative markers.
#'
#' @inheritParams FindMarkers
#' @param node The node in the phylogenetic tree to use as a branch point
#' @param tree.use Can optionally pass the tree to be used. Default uses the tree in object@@cluster.tree
#' @param ... Additional arguments passed to FindMarkers
#'
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, ROC score, etc.)
#'
#' @export
#'
FindMarkersNode <- function(
  object,
  node,
  tree.use = NULL,
  genes.use = NULL,
  thresh.use = 0.25,
  test.use = "bimod",
  ...
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  tree <- SetIfNull(x = tree.use, default = object@cluster.tree[[1]])
  ident.order <- tree$tip.label
  # the two clades below the node define the two groups to compare
  nodes.1 <- ident.order[GetLeftDescendants(tree = tree, node = node)]
  nodes.2 <- ident.order[GetRightDescendants(tree = tree, node = node)]
  to.return <- FindMarkers(
    object = object,
    ident.1 = nodes.1,
    ident.2 = nodes.2,
    genes.use = genes.use,
    thresh.use = thresh.use,
    test.use = test.use,
    ...
  )
  return(to.return)
}

#' Find all markers for a node
#'
#' This function finds markers for all splits at or below the specified node
#'
#' @param object Seurat object. Must have object@@cluster.tree slot filled. Use BuildClusterTree() if not.
#' @param node Node from which to start identifying split markers, default is top node.
#' @param genes.use Genes to test. Default is to use all genes
#' @param thresh.use Limit testing to genes which show, on average, at least
#' X-fold difference (log-scale) between the two groups of cells.
#' @param test.use Denotes which test to use. Seurat currently implements
#' "bimod" (likelihood-ratio test for single cell gene expression, McDavid et
#' al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t"
#' (Students t-test), and "tobit" (Tobit-test for differential gene expression,
#' as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'.
#' The latter two options should only be used on UMI datasets, and assume an underlying
#' poisson or negative-binomial distribution.
#' @param min.pct - only test genes that are detected in a minimum fraction of min.pct cells
#' in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed
#' @param min.diff.pct - only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default
#' @param only.pos Only return positive markers (FALSE by default)
#' @param print.bar Print a progress bar once expression testing begins (uses pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number. Default is no downsampling.
#' @param random.seed Random seed for downsampling
#' @param return.thresh Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)
#' @param do.print FALSE by default. If TRUE, outputs updates on progress.
#' @param min.cells Minimum number of cells expressing the gene in at least one of the two groups
#'
#' @return Returns a dataframe with a ranked list of putative markers for each node and associated statistics
#'
#' @importFrom ape drop.tip
#'
#' @export
#'
FindAllMarkersNode <- function(
  object,
  node = NULL,
  genes.use = NULL,
  thresh.use = 0.25,
  test.use = "bimod",
  min.pct = 0.1,
  min.diff.pct = 0.05,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  return.thresh = 1e-2,
  do.print = FALSE,
  random.seed = 1,
  min.cells = 3
) {
  if (length(object@cluster.tree) == 0) {
    stop("Tree hasn't been built yet. Run BuildClusterTree to build.")
  }
  genes.use <- SetIfNull(x = genes.use, default = rownames(object@data))
  node <- SetIfNull(x = node, default = object@cluster.tree[[1]]$edge[1, 1])
  tree.use <- object@cluster.tree[[1]]
  descendants <- DFT(tree = tree.use, node = node, path = NULL, include.children = TRUE)
  # tips are edge endpoints that never appear as a parent
  all.children <- sort(x = tree.use$edge[, 2][!tree.use$edge[, 2] %in% tree.use$edge[, 1]])
  # FIX: the mapped result was stored in an unused 'descendants1', so the
  # setdiff below compared tip labels against numeric node ids and dropped
  # every tip; assign back to 'descendants'.
  descendants <- MapVals(v = descendants, from = all.children, to = tree.use$tip.label)
  drop.children <- setdiff(tree.use$tip.label, descendants)
  keep.children <- setdiff(tree.use$tip.label, drop.children)
  orig.nodes <- c(node, as.numeric(setdiff(descendants, keep.children)))
  # prune the tree down to the clade under 'node'; node ids are renumbered,
  # so remember the mapping back to the original ids
  tree.use <- drop.tip(tree.use, drop.children)
  new.nodes <- unique(tree.use$edge[, 1])
  if ((test.use == 'roc') && (return.thresh == 1e-2)) {
    return.thresh <- 0.7
  }
  genes.de <- list()
  # internal nodes of the pruned tree are numbered Nnode+2 .. max(edge)
  for (i in ((tree.use$Nnode + 2):max(tree.use$edge))) {
    if (do.print) {
      print(paste("Calculating node", i))
    }
    genes.de[[i]] <- FindMarkersNode(
      object = object,
      node = i,
      tree.use = tree.use,
      genes.use = genes.use,
      thresh.use = thresh.use,
      test.use = test.use,
      min.pct = min.pct,
      min.diff.pct = min.diff.pct,
      print.bar = print.bar,
      only.pos = only.pos,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed,
      min.cells = min.cells
    )
  }
  gde.all <- data.frame()
  for (i in ((tree.use$Nnode + 2):max(tree.use$edge))) {
    gde <- genes.de[[i]]
    if (is.null(x = unlist(gde))) {
      next
    }
    if (nrow(x = gde) > 0) {
      if (test.use == 'roc') {
        gde <- subset(
          x = gde,
          subset = (myAUC > return.thresh | myAUC < (1 - return.thresh))
        )
      }
      if ((test.use == 'bimod') || (test.use == 't')) {
        gde <- gde[order(gde$p_val, -gde$avg_diff), ]
        gde <- subset(x = gde, subset = p_val < return.thresh)
      }
      if (nrow(x = gde) > 0) {
        gde$cluster <- i
        gde$gene <- rownames(x = gde)
        gde.all <- rbind(gde.all, gde)
      }
    }
  }
  # translate pruned-tree node ids back to the original tree's ids
  gde.all$cluster <- MapVals(
    v = gde.all$cluster,
    from = new.nodes,
    to = orig.nodes
  )
  return(gde.all)
}

#' Finds markers that are conserved between the two groups
#'
#' @param object Seurat object
#' @param ident.1 Identity class to define markers for
#' @param ident.2 A second identity class for comparison. If NULL (default) - use all other cells
#' for comparison.
#' @param grouping.var grouping variable
#' @param \dots parameters to pass to FindMarkers
#'
#' @return Matrix containing a ranked list of putative conserved markers, and associated statistics
#' (p-values within each group and a combined p-value (fisher_pval), percentage of cells expressing
#' the marker, average differences)
#'
#' @export
#'
FindConservedMarkers <- function(
  object,
  ident.1,
  ident.2 = NULL,
  grouping.var,
  ...
) {
  object.var <- FetchData(object = object, vars.all = grouping.var)
  # split every identity class by the grouping variable: "<ident>_<group>"
  object <- SetIdent(
    object = object,
    cells.use = object@cell.names,
    ident.use = paste(object@ident, object.var[, 1], sep = "_")
  )
  levels.split <- names(x = sort(x = table(object.var[, 1])))
  if (length(x = levels.split) != 2) {
    stop(
      paste0(
        "There are not two options for ",
        grouping.var,
        ". \n Current groups include: ",
        paste(levels.split, collapse = ", ")
      )
    )
  }
  cells <- list()
  for (i in 1:2) {
    cells[[i]] <- rownames(
      x = object.var[object.var[, 1] == levels.split[i], , drop = FALSE]
    )
  }
  marker.test <- list()
  # run the marker test separately within each group
  for (i in 1:2) {
    level.use <- levels.split[i]
    ident.use.1 <- paste(ident.1, level.use, sep = "_")
    cells.1 <- WhichCells(object = object, ident = ident.use.1)
    if (is.null(x = ident.2)) {
      cells.2 <- setdiff(x = cells[[i]], y = cells.1)
      ident.use.2 <- names(x = which(x = table(object@ident[cells.2]) > 0))
      if (length(x = ident.use.2) == 0) {
        stop(paste("Only one identity class present:", ident.1))
      }
    }
    if (!is.null(x = ident.2)) {
      ident.use.2 <- paste(ident.2, level.use, sep = "_")
    }
    cat(
      paste0(
        "Testing ",
        ident.use.1,
        " vs ",
        paste(ident.use.2, collapse = ", "), "\n"
      ),
      file = stderr()
    )
    marker.test[[i]] <- FindMarkers(
      object = object,
      ident.1 = ident.use.1,
      ident.2 = ident.use.2,
      ...
    )
  }
  # keep genes significant in both groups and combine their statistics
  genes.conserved <- intersect(
    x = rownames(x = marker.test[[1]]),
    y = rownames(x = marker.test[[2]])
  )
  markers.conserved <- list()
  for (i in 1:2) {
    markers.conserved[[i]] <- marker.test[[i]][genes.conserved, ]
    colnames(x = markers.conserved[[i]]) <- paste(
      levels.split[i],
      colnames(x = markers.conserved[[i]]),
      sep = "_"
    )
  }
  markers.combined <- cbind(markers.conserved[[1]], markers.conserved[[2]])
  pval.codes <- paste(levels.split, "p_val", sep = "_")
  markers.combined$max_pval <- apply(
    X = markers.combined[, pval.codes],
    MARGIN = 1,
    FUN = max
  )
  markers.combined$fisher_pval <- apply(
    X = markers.combined[, pval.codes],
    MARGIN = 1,
    FUN = FisherIntegrate
  )
  markers.combined <- markers.combined[order(markers.combined$fisher_pval), ]
  return(markers.combined)
}

#' Likelihood ratio test for zero-inflated data
#'
#' Identifies differentially expressed genes between two groups of cells using
#' the LRT model proposed in McDavid et al, Bioinformatics, 2013
#'
#' @inheritParams FindMarkers
#' @param object Seurat object
#' @param cells.1 Group 1 cells
#' @param cells.2 Group 2 cells
#' @return Returns a p-value ranked matrix of putative differentially expressed
#' genes.
#'
#' @export
#'
DiffExpTest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  print.bar = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # pblapply adds a progress bar; otherwise plain lapply
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  # one LRT per gene on the (log-scale) expression values of the two groups
  p_val <- unlist(
    x = iterate.fxn(
      X = genes.use,
      FUN = function(x) {
        return(
          DifferentialLRT(
            x = as.numeric(x = object@data[x, cells.1]),
            y = as.numeric(x = object@data[x, cells.2])
          )
        )
      }
    )
  )
  to.return <- data.frame(p_val, row.names = genes.use)
  return(to.return)
}

#' Negative binomial test for UMI-count based data
#'
#' Identifies differentially expressed genes between two groups of cells using
#' a negative binomial generalized linear model
#'
#' @inheritParams FindMarkers
#' @param object Seurat object
#' @param cells.1 Group 1 cells
#' @param cells.2 Group 2 cells
#'
#' @return Returns a p-value ranked matrix of putative differentially expressed
#' genes.
#'
#' @importFrom MASS glm.nb
#' @importFrom pbapply pblapply
#'
#' @export
#'
NegBinomDETest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  latent.vars = NULL,
  print.bar = TRUE,
  min.cells = 3
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # check that the gene made it through the any filtering that was done
  genes.use <- genes.use[genes.use %in% rownames(x = object@data)]
  # latent covariates (e.g. nUMI) are taken from the raw data
  my.latent <- FetchData(
    object = object,
    vars.all = latent.vars,
    cells.use = c(cells.1, cells.2),
    use.raw = TRUE
  )
  # NB regression is fit on raw counts, not the normalized @data slot
  to.test.data <- object@raw.data[genes.use, c(cells.1, cells.2)]
  to.test <- data.frame(my.latent, row.names = c(cells.1, cells.2))
  to.test[cells.1, "group"] <- "A"
  to.test[cells.2, "group"] <- "B"
  to.test$group <- factor(x = to.test$group)
  # model: GENE ~ group + latent.vars; the group coefficient is tested
  latent.vars <- c("group", latent.vars)
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  p_val <- unlist(
    x = iterate.fxn(
      X = genes.use,
      FUN = function(x) {
        to.test[, "GENE"] <- as.numeric(x = to.test.data[x, ])
        # check that gene is expressed in specified number of cells in one group
        if (sum(to.test$GENE[to.test$group == "A"]) < min.cells ||
            sum(to.test$GENE[to.test$group == "B"]) < min.cells) {
          warning(paste0(
            "Skipping gene --- ",
            x,
            ". Fewer than ",
            min.cells,
            " in at least one of the two clusters."
          ))
          # p-value sentinel 2 marks a skipped gene; filtered out below
          return(2)
        }
        # check that variance between groups is not 0
        if (var(x = to.test$GENE) == 0) {
          warning(paste0(
            "Skipping gene -- ",
            x,
            ". No variance in expression between the two clusters."
          ))
          return(2)
        }
        fmla <- as.formula(paste0("GENE ", " ~ ", paste(latent.vars, collapse = "+")))
        # keep the sentinel if glm.nb fails to converge
        p.estimate <- 2
        try(
          expr = p.estimate <- summary(
            object = glm.nb(formula = fmla, data = to.test)
          )$coef[2, 4],
          silent = TRUE
        )
        return(p.estimate)
      }
    )
  )
  # drop genes flagged with the sentinel value
  if (length(x = which(x = p_val == 2)) > 0){
    genes.use <- genes.use[-which(x = p_val == 2)]
    p_val <- p_val[! p_val == 2]
  }
  to.return <- data.frame(p_val, row.names = genes.use)
  return(to.return)
}

#' Negative binomial test for UMI-count based data (regularized version)
#'
#' Identifies differentially expressed genes between two groups of cells using
#' a likelihood ratio test of negative binomial generalized linear models where
#' the overdispersion parameter theta is determined by pooling information
#' across genes.
#'
#' @inheritParams FindMarkers
#' @param object Seurat object
#' @param cells.1 Group 1 cells
#' @param cells.2 Group 2 cells
#'
#' @return Returns a p-value ranked data frame of test results.
#'
#' @export
#'
NegBinomRegDETest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  latent.vars = NULL,
  print.bar = TRUE,
  min.cells = 3
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # check that the gene made it through any filtering that was done
  genes.use <- genes.use[genes.use %in% rownames(x = object@data)]
  print(
    sprintf(
      'NegBinomRegDETest for %d genes and %d and %d cells',
      length(x = genes.use),
      length(x = cells.1),
      length(x = cells.2)
    )
  )
  grp.fac <- factor(
    x = c(
      rep.int(x = 'A', times = length(x = cells.1)),
      rep.int(x = 'B', times = length(x = cells.2))
    )
  )
  to.test.data <- object@raw.data[genes.use, c(cells.1, cells.2), drop = FALSE]
  print('Calculating mean per gene per group')
  # only test genes detected in >= 2% of cells in at least one group
  above.threshold <- pmax(
    apply(X = to.test.data[, cells.1] > 0, MARGIN = 1, FUN = mean),
    apply(X = to.test.data[, cells.2] > 0, MARGIN = 1, FUN = mean)
  ) >= 0.02
  print(
    sprintf(
      '%d genes are detected in at least 2%% of the cells in at least one of the groups and will be tested',
      sum(above.threshold)
    )
  )
  genes.use <- genes.use[above.threshold]
  to.test.data <- to.test.data[genes.use, , drop = FALSE]
  my.latent <- FetchData(
    object = object,
    vars.all = latent.vars,
    cells.use = c(cells.1, cells.2),
    use.raw = TRUE
  )
  to.test <- data.frame(my.latent, row.names = c(cells.1, cells.2))
  print(paste('Latent variables are', latent.vars))
  # get regularized theta (ignoring group factor)
  theta.fit <- RegularizedTheta(
    cm = to.test.data,
    latent.data = to.test,
    min.theta = 0.01,
    bin.size = 128
  )
  print('Running NB regression model comparison')
  to.test$NegBinomRegDETest.group <- grp.fac
  # process genes in bins of 128 so the progress bar advances per bin
  bin.size <- 128
  bin.ind <- ceiling(1:length(x = genes.use) / bin.size)
  max.bin <- max(bin.ind)
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
  res <- c()
  for (i in 1:max.bin) {
    genes.bin.use <- genes.use[bin.ind == i]
    bin.out.lst <- parallel::mclapply(
      X = genes.bin.use,
      FUN = function(j) {
        return(NBModelComparison(
          y = to.test.data[j, ],
          theta = theta.fit[j],
          latent.data = to.test,
          com.frac = latent.vars,
          grp.frac = 'NegBinomRegDETest.group'
        ))
      }
    )
    res <- rbind(res, do.call(rbind, bin.out.lst))
    setTxtProgressBar(pb = pb, value = i)
  }
  close(pb)
  rownames(res) <- genes.use
  res <- as.data.frame(x = res)
  res$adj.pval <- p.adjust(p = res$pval, method = 'fdr')
  res <- res[order(res$pval, -abs(x = res$log.fc)), ]
  return(res)
}

#' Poisson test for UMI-count based data
#'
#' Identifies differentially expressed genes between two groups of cells using
#' a poisson generalized linear model
#'
#' @inheritParams FindMarkers
#' @param object Seurat object
#' @param cells.1 Group 1 cells
#' @param cells.2 Group 2 cells
#' @param min.cells Minimum number of cells expressing the gene in at least one
#' of the two groups
#'
#' @return Returns a p-value ranked matrix of putative differentially expressed
#' genes.
#'
#' @importFrom pbapply pblapply
#'
#' @export
#'
PoissonDETest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  latent.vars = NULL,
  print.bar = TRUE,
  # FIX: the body referenced min.cells but the parameter was missing, so every
  # call failed with "object 'min.cells' not found" (or "unused argument" when
  # FindMarkers passed it). Default matches NegBinomDETest.
  min.cells = 3
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # check that the gene made it through any filtering that was done
  genes.use <- genes.use[genes.use %in% rownames(x = object@data)]
  my.latent <- FetchData(
    object = object,
    vars.all = latent.vars,
    cells.use = c(cells.1, cells.2),
    use.raw = TRUE
  )
  # Poisson regression is fit on raw counts
  to.test.data <- object@raw.data[genes.use, c(cells.1, cells.2)]
  to.test <- data.frame(my.latent, row.names = c(cells.1, cells.2))
  to.test[cells.1, "group"] <- "A"
  to.test[cells.2, "group"] <- "B"
  to.test$group <- factor(x = to.test$group)
  # model: GENE ~ group + latent.vars; the group coefficient is tested
  latent.vars <- c("group", latent.vars)
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  p_val <- unlist(
    x = iterate.fxn(
      X = genes.use,
      FUN = function(x) {
        to.test[, "GENE"] <- as.numeric(x = to.test.data[x, ])
        # check that gene is expressed in specified number of cells in one group
        if (sum(to.test$GENE[to.test$group == "A"]) < min.cells ||
            sum(to.test$GENE[to.test$group == "B"]) < min.cells) {
          # FIX: added the missing space after "Fewer than"
          warning(paste0(
            "Skipping gene --- ",
            x,
            ". Fewer than ",
            min.cells,
            " in at least one of the two clusters."
          ))
          # sentinel 2 marks a skipped gene; filtered out below
          return(2)
        }
        # check that variance between groups is not 0
        # FIX: removed leftover debug statement print("what")
        if (var(to.test$GENE) == 0) {
          warning(paste0(
            "Skipping gene -- ",
            x,
            ". No variance in expression between the two clusters."
          ))
          return(2)
        }
        fmla <- as.formula(
          object = paste0("GENE ", " ~ ", paste(latent.vars, collapse = "+"))
        )
        return(
          summary(
            object = glm(
              formula = fmla,
              data = to.test,
              family = "poisson"
            )
          )$coef[2, 4]
        )
      }
    )
  )
  # drop genes flagged with the sentinel value
  if (length(x = which(x = p_val == 2)) > 0) {
    genes.use <- genes.use[-which(x = p_val == 2)]
    p_val <- p_val[! p_val == 2]
  }
  to.return <- data.frame(p_val, row.names = genes.use)
  return(to.return)
}

#' Differential expression testing using Tobit models
#'
#' Identifies differentially expressed genes between two groups of cells using
#' Tobit models, as proposed in Trapnell et al., Nature Biotechnology, 2014
#'
#' @inheritParams FindMarkers
#' @inheritParams DiffExpTest
#'
#' @return Returns a p-value ranked matrix of putative differentially expressed
#' genes.
#'
#' @export
#'
TobitTest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  print.bar = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  # thin wrapper: the per-gene Tobit fits live in TobitDiffExpTest
  to.return <- TobitDiffExpTest(
    data1 = object@data[, cells.1],
    data2 = object@data[, cells.2],
    mygenes = genes.use,
    print.bar = print.bar
  )
  return(to.return)
}

#' ROC-based marker discovery
#'
#' Identifies 'markers' of gene expression using ROC analysis. For each gene,
#' evaluates (using AUC) a classifier built on that gene alone, to classify
#' between two groups of cells.
#'
#' An AUC value of 1 means that expression values for this gene alone can
#' perfectly classify the two groupings (i.e. Each of the cells in cells.1
#' exhibit a higher level than each of the cells in cells.2). An AUC value of 0
#' also means there is perfect classification, but in the other direction. A
#' value of 0.5 implies that the gene has no predictive power to classify the
#' two groups.
#'
#' @inheritParams FindMarkers
#' @inheritParams DiffExpTest
#' @param object Seurat object
#'
#' @return Returns a 'predictive power' (abs(AUC-0.5)) ranked matrix of
#' putative differentially expressed genes.
#'
#' @import ROCR
#'
#' @export
#'
MarkerTest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  print.bar = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  to.return <- AUCMarkerTest(
    data1 = object@data[, cells.1],
    data2 = object@data[, cells.2],
    mygenes = genes.use,
    print.bar = print.bar
  )
  # rescale AUC to 'predictive power': 0 = uninformative, 1 = perfect
  # classification in either direction
  to.return$power <- abs(x = to.return$myAUC - 0.5) * 2
  return(to.return)
}

#' Differential expression testing using Student's t-test
#'
#' Identify differentially expressed genes between two groups of cells using
#' the Student's t-test
#'
#' @inheritParams FindMarkers
#' @inheritParams DiffExpTest
#'
#' @return Returns a p-value ranked matrix of putative differentially expressed
#' genes.
#'
#' @importFrom pbapply pblapply
#'
#' @export
#'
DiffTTest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  print.bar = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  data.use <- object@data
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  # per-gene two-sample t-test between the two groups of cells
  p_val <- unlist(
    x = iterate.fxn(
      X = genes.use,
      FUN = function(x) {
        t.test(x = data.use[x, cells.1], y = data.use[x, cells.2])$p.value
      }
    )
  )
  to.return <- data.frame(p_val, row.names = genes.use)
  return(to.return)
}
diff --git a/R/differential_expression_internal.R b/R/differential_expression_internal.R
new file mode 100644
index 000000000..7c1e9ff4f
--- /dev/null
+++ b/R/differential_expression_internal.R
@@ -0,0 +1,261 @@
# internal function to run the mcdavid et al. bimodal likelihood-ratio DE test;
# compares the bimodal likelihood of the pooled data against the two groups
# modeled separately (chi-square with 3 df)
DifferentialLRT <- function(x, y, xmin = 0) {
  # forward xmin to the likelihood helper (previously silently ignored;
  # behavior is unchanged for the default xmin = 0)
  lrtX <- bimodLikData(x = x, xmin = xmin)
  lrtY <- bimodLikData(x = y, xmin = xmin)
  lrtZ <- bimodLikData(x = c(x, y), xmin = xmin)
  lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
  return(pchisq(q = lrt_diff, df = 3, lower.tail = FALSE))
}

#internal function to run mcdavid et al.
# DE test (continuation of the comment above): per-group bimodal
# log-likelihood of expression values, split at the detection threshold xmin
bimodLikData <- function(x, xmin = 0) {
  x1 <- x[x <= xmin]                    # undetected observations
  x2 <- x[x > xmin]                     # detected observations
  # mixing proportion, clamped away from 0/1 so the logs stay finite
  xal <- MinMax(
    data = length(x = x2) / length(x = x),
    min = 1e-5,
    max = (1 - 1e-5)
  )
  likA <- length(x = x1) * log(x = 1 - xal)
  # with < 2 detected values sd() would be NA; fall back to unit sd
  if (length(x = x2) < 2) {
    mysd <- 1
  } else {
    mysd <- sd(x = x2)
  }
  likB <- length(x = x2) *
    log(x = xal) +
    sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
  return(likA + likB)
}

# internal function to run the Tobit DE test over a vector of genes
TobitDiffExpTest <- function(data1, data2, mygenes, print.bar) {
  # choose the iterator BEFORE running the tests so print.bar is honored
  # (previously iterate.fxn was assigned after a hard-coded lapply and never
  # used, so the progress bar was silently ignored)
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  p_val <- unlist(x = iterate.fxn(
    X = mygenes,
    FUN = function(x) {
      return(DifferentialTobit(
        x1 = as.numeric(x = data1[x, ]),
        x2 = as.numeric(x = data2[x, ])
      ))
    }
  ))
  # genes where the model failed get p = 1 (no evidence of DE)
  p_val[is.na(x = p_val)] <- 1
  toRet <- data.frame(p_val, row.names = mygenes)
  return(toRet)
}

# internal function to run the Tobit DE test for one gene: likelihood-ratio
# test between an intercept-only censored model and one with a group term
DifferentialTobit <- function(x1, x2, lower = 1, upper = Inf) {
  my.df <- data.frame(
    c(x1, x2),
    c(rep(x = 0, length(x = x1)), rep(x = 1, length(x = x2)))
  )
  colnames(x = my.df) <- c("Expression", "Stat")
  # null model: intercept only
  model.v1 <- TobitFitter(
    x = my.df,
    modelFormulaStr = "Expression~1",
    lower = lower,
    upper = upper
  )
  # alternative model: group indicator included
  model.v2 <- TobitFitter(
    x = my.df,
    modelFormulaStr = "Expression~Stat+1",
    lower = lower,
    upper = upper
  )
  if (! is.null(x = model.v1) && ! is.null(x = model.v2)) {
    p <- pchisq(
      q = 2 * (logLik(object = model.v2) - logLik(object = model.v1)),
      df = 1,
      lower.tail = FALSE
    )
  } else {
    # either fit failed: report no evidence of DE
    p <- 1
  }
  return(p)
}

# internal helper for the Tobit DE test: fit a censored regression, returning
# NULL on failure so callers can degrade gracefully
# credit to Cole Trapnell for this
TobitFitter <- function(x, modelFormulaStr, lower = 1, upper = Inf){
  tryCatch(
    expr = return(suppressWarnings(expr = vgam(
      formula = as.formula(object = modelFormulaStr),
      family = tobit(Lower = lower, Upper = upper),
      data = x
    ))),
    error = function(e) { NULL }
  )
}

# internal function to calculate per-gene AUC and mean log-fold-change
AUCMarkerTest <- function(data1, data2, mygenes, print.bar = TRUE) {
  # choose the iterator up front so print.bar applies to both passes
  # (previously the AUC pass was a hard-coded lapply)
  if (print.bar) {
    iterate.fxn <- pblapply
  } else {
    iterate.fxn <- lapply
  }
  myAUC <- unlist(x = iterate.fxn(
    X = mygenes,
    FUN = function(x) {
      return(DifferentialAUC(
        x = as.numeric(x = data1[x, ]),
        y = as.numeric(x = data2[x, ])
      ))
    }
  ))
  myAUC[is.na(x = myAUC)] <- 0
  avg_diff <- unlist(x = iterate.fxn(
    X = mygenes,
    FUN = function(x) {
      return(
        ExpMean(
          x = as.numeric(x = data1[x, ])
        ) - ExpMean(
          x = as.numeric(x = data2[x, ])
        )
      )
    }
  ))
  toRet <- data.frame(cbind(myAUC, avg_diff), row.names = mygenes)
  # rank genes by decreasing AUC
  toRet <- toRet[rev(x = order(toRet$myAUC)), ]
  return(toRet)
}

# internal function to calculate the AUC of one gene as a classifier between
# the two groups (via ROCR)
DifferentialAUC <- function(x, y) {
  prediction.use <- prediction(
    predictions = c(x, y),
    labels = c(rep(x = 1, length(x = x)), rep(x = 0, length(x = y))),
    label.ordering = 0:1
  )
  perf.use <- performance(prediction.obj = prediction.use, measure = "auc")
  auc.use <- round(x = perf.use@y.values[[1]], digits = 3)
  return(auc.use)
}

# given a UMI count matrix, estimate NB theta parameter for each gene
# and use fit of relationship with mean to assign regularized theta to each gene
RegularizedTheta <- function(cm, latent.data, min.theta = 0.01, bin.size = 128) {
  genes.regress <- rownames(x = cm)
  bin.ind <-
ceiling(x = 1:length(x = genes.regress) / bin.size) + max.bin <- max(bin.ind) + print('Running Poisson regression (to get initial mean), and theta estimation per gene') + pb <- txtProgressBar(min = 0, max = max.bin, style = 3) + theta.estimate <- c() + for (i in 1:max.bin) { + genes.bin.regress <- genes.regress[bin.ind == i] + bin.theta.estimate <- unlist( + x = parallel::mclapply( + X = genes.bin.regress, + FUN = function(j) { + return(as.numeric(x = MASS::theta.ml( + y = cm[j, ], + mu = glm( + formula = cm[j, ] ~ ., + data = latent.data, + family = poisson + )$fitted + ))) + } + ), + use.names = FALSE + ) + theta.estimate <- c(theta.estimate, bin.theta.estimate) + setTxtProgressBar(pb = pb, value = i) + } + close(con = pb) + UMI.mean <- apply(X = cm, MARGIN = 1, FUN = mean) + var.estimate <- UMI.mean + (UMI.mean ^ 2) / theta.estimate + for (span in c(1/3, 1/2, 3/4, 1)) { + fit <- loess( + formula = log10(x = var.estimate) ~ log10(x = UMI.mean), + span = span + ) + if (! any(is.na(x = fit$fitted))) { + cat(sprintf( + 'Used loess with span %1.2f to fit mean-variance relationship\n', + span + )) + break + } + } + if (any(is.na(x = fit$fitted))) { + stop('Problem when fitting NB gene variance in RegularizedTheta - NA values were fitted.') + } + theta.fit <- (UMI.mean ^ 2) / ((10 ^ fit$fitted) - UMI.mean) + names(x = theta.fit) <- genes.regress + to.fix <- theta.fit <= min.theta | is.infinite(x = theta.fit) + if (any(to.fix)) { + cat( + 'Fitted theta below', + min.theta, + 'for', + sum(to.fix), + 'genes, setting them to', + min.theta, + '\n' + ) + theta.fit[to.fix] <- min.theta + } + return(theta.fit) +} + +# compare two negative binomial regression models +# model one uses only common factors (com.fac) +# model two additionally uses group factor (grp.fac) +NBModelComparison <- function(y, theta, latent.data, com.fac, grp.fac) { + tab <- as.matrix(x = table(y > 0, latent.data[, grp.fac])) + freqs <- tab['TRUE', ] / apply(X = tab, MARGIN = 2, FUN = sum) + fit2 <- 0 + 
fit4 <- 0 + try( + expr = fit2 <- glm( + formula = y ~ ., + data = latent.data[, com.fac, drop = FALSE], + family = MASS::negative.binomial(theta = theta) + ), + silent=TRUE + ) + try( + fit4 <- glm( + formula = y ~ ., + data = latent.data[, c(com.fac, grp.fac)], + family = MASS::negative.binomial(theta = theta) + ), + silent = TRUE + ) + if (class(x = fit2)[1] == 'numeric' | class(x = fit4)[1] == 'numeric') { + message('One of the glm.nb calls failed') + return(c(rep(x = NA, 5), freqs)) + } + pval <- anova(fit2, fit4, test = 'Chisq')$'Pr(>Chi)'[2] + foi <- 2 + length(x = com.fac) + log.fc <- log2(x = exp(x = coef(object = fit4)[foi])) #log.fc <- log2(1/exp(coef(fit4)[foi])) + ret <- c( + fit2$deviance, + fit4$deviance, + pval, + coef(object = fit4)[foi], + log.fc, + freqs + ) + names(x = ret) <- c( + 'dev1', + 'dev2', + 'pval', + 'coef', + 'log.fc', + 'freq1', + 'freq2' + ) + return(ret) +} diff --git a/R/dimensional_reduction.R b/R/dimensional_reduction.R new file mode 100644 index 000000000..191318b90 --- /dev/null +++ b/R/dimensional_reduction.R @@ -0,0 +1,1030 @@ +#' Run Principal Component Analysis on gene expression using IRLBA +#' +#' Run a PCA dimensionality reduction. For details about stored PCA calculation +#' parameters, see \code{\link{PrintPCAParams}}. +#' +#' @param object Seurat object +#' @param pc.genes Genes to use as input for PCA. Default is object@@var.genes +#' @param pcs.compute Total Number of PCs to compute and store +#' @param use.imputed Run PCA on imputed values (FALSE by default) +#' @param rev.pca By default computes the PCA on the cell x gene matrix. Setting +#' to true will compute it on gene x cell matrix. 
#' @param weight.by.var Weight the cell embeddings by the variance of each PC
#' (weights the gene loadings if rev.pca is TRUE)
#' @param do.print Print the top genes associated with high/low loadings for
#' the PCs
#' @param pcs.print PCs to print genes for
#' @param genes.print Number of genes to print for each PC
#' @param \dots Additional arguments to be passed to IRLBA
#'
#' @return Returns Seurat object with the PCA calculation stored in
#' object@@dr$pca.
#'
#' @importFrom irlba irlba
#'
#' @export
#'
RunPCA <- function(
  object,
  pc.genes = NULL,
  pcs.compute = 20,
  use.imputed = FALSE,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  do.print = TRUE,
  pcs.print = 1:5,
  genes.print = 30,
  ...
) {
  data.use <- PrepDR(
    object = object,
    genes.use = pc.genes,
    use.imputed = use.imputed)
  # cannot compute more PCs than there are cells
  pcs.compute <- min(pcs.compute, ncol(x = data.use))
  if (rev.pca) {
    # PCA on the gene x cell orientation
    pca.results <- irlba(A = data.use, nv = pcs.compute, ...)
    sdev <- pca.results$d / sqrt(max(1, nrow(data.use) - 1))
    if (weight.by.var) {
      gene.loadings <- pca.results$u %*% diag(pca.results$d)
    } else {
      gene.loadings <- pca.results$u
    }
    cell.embeddings <- pca.results$v
  } else {
    # default: PCA on the cell x gene orientation
    pca.results <- irlba(A = t(x = data.use), nv = pcs.compute, ...)
    gene.loadings <- pca.results$v
    sdev <- pca.results$d / sqrt(max(1, ncol(data.use) - 1))
    if (weight.by.var) {
      cell.embeddings <- pca.results$u %*% diag(pca.results$d)
    } else {
      cell.embeddings <- pca.results$u
    }
  }
  rownames(x = gene.loadings) <- rownames(x = data.use)
  colnames(x = gene.loadings) <- paste0("PC", 1:pcs.compute)
  rownames(x = cell.embeddings) <- colnames(x = data.use)
  colnames(x = cell.embeddings) <- colnames(x = gene.loadings)
  pca.obj <- new(
    Class = "dim.reduction",
    gene.loadings = gene.loadings,
    cell.embeddings = cell.embeddings,
    sdev = sdev,
    key = "PC"
  )
  object@dr$pca <- pca.obj
  # snapshot the call's arguments for later inspection via PrintPCAParams
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunPCA"))]
  object <- SetCalcParams(object = object, calculation = "RunPCA", ... = parameters.to.store)
  if (is.null(object@calc.params$RunPCA$pc.genes)) {
    # record the genes actually used when the default (var.genes) was taken
    object@calc.params$RunPCA$pc.genes <- rownames(data.use)
  }
  if (do.print) {
    PrintPCA(object = object, pcs.print = pcs.print, genes.print = genes.print)
  }
  return(object)
}


#' Run Independent Component Analysis on gene expression
#'
#' Run fastica algorithm from the ica package for ICA dimensionality reduction.
#' For details about stored ICA calculation parameters, see
#' \code{\link{PrintICAParams}}.
#'
#' @param object Seurat object
#' @param ic.genes Genes to use as input for ICA. Default is object@@var.genes
#' @param ics.compute Number of ICs to compute
#' @param use.imputed Run ICA on imputed values (FALSE by default)
#' @param rev.ica By default, computes the dimensional reduction on the cell x
#' gene matrix. Setting to true will compute it on the transpose (gene x cell
#' matrix).
#' @param print.results Print the top genes associated with each dimension
#' @param ics.print ICs to print genes for
#' @param genes.print Number of genes to print for each IC
#' @param ica.function ICA function from ica package to run (options: icafast,
#' icaimax, icajade)
#' @param seed.use Random seed to use for fastica
#' @param \dots Additional arguments to be passed to fastica
#'
#' @importFrom ica icafast icaimax icajade
#'
#' @return Returns Seurat object with an ICA calculation stored in
#' object@@dr$ica
#'
#' @export
#'
RunICA <- function(
  object,
  ic.genes = NULL,
  ics.compute = 50,
  use.imputed = FALSE,
  rev.ica = FALSE,
  print.results = TRUE,
  ics.print = 1:5,
  genes.print = 50,
  ica.function = "icafast",
  seed.use = 1,
  ...
) {
  data.use <- PrepDR(
    object = object,
    genes.use = ic.genes,
    use.imputed = use.imputed)
  set.seed(seed = seed.use)
  # cannot compute more ICs than there are cells
  ics.compute <- min(ics.compute, ncol(x = data.use))
  # resolve the requested ica-package function (icafast/icaimax/icajade)
  ica.fxn <- eval(parse(text = ica.function))
  if (rev.ica) {
    ica.results <- ica.fxn(data.use, nc = ics.compute, ...)
    cell.embeddings <- ica.results$M
  } else {
    ica.results <- ica.fxn(t(x = data.use), nc = ics.compute, ...)
    cell.embeddings <- ica.results$S
  }
  # project genes onto the IC space to obtain loadings
  gene.loadings <- (as.matrix(x = data.use) %*% as.matrix(x = cell.embeddings))
  colnames(x = gene.loadings) <- paste0("IC", 1:ncol(x = gene.loadings))
  colnames(x = cell.embeddings) <- paste0("IC", 1:ncol(x = cell.embeddings))
  ica.obj <- new(
    Class = "dim.reduction",
    gene.loadings = gene.loadings,
    cell.embeddings = cell.embeddings,
    sdev = sqrt(x = ica.results$vafs),
    key = "IC"
  )
  object@dr$ica <- ica.obj
  # snapshot this function's own arguments (was formals("ICA"), which named
  # the wrong function)
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunICA"))]
  object <- SetCalcParams(object = object, calculation = "ICA", ... = parameters.to.store)
  if (is.null(object@calc.params$ICA$ic.genes)) {
    # record the genes actually used when the default (var.genes) was taken
    object@calc.params$ICA$ic.genes <- rownames(data.use)
  }
  # was `if (do.print)`: do.print is not an argument of RunICA and is
  # undefined here; print.results is the documented flag
  if (print.results) {
    PrintICA(object = object, ics.print = ics.print, genes.print = genes.print)
  }
  return(object)
}


#' Run t-distributed Stochastic Neighbor Embedding
#'
#' Run t-SNE dimensionality reduction on selected features. Has the option of
#' running in a reduced dimensional space (i.e. spectral tSNE, recommended),
#' or running based on a set of genes. For details about stored TSNE calculation
#' parameters, see \code{\link{PrintTSNEParams}}.
#'
#' @param object Seurat object
#' @param reduction.use Which dimensional reduction (e.g. PCA, ICA) to use for
#' the tSNE. Default is PCA
#' @param cells.use Which cells to analyze (default, all cells)
#' @param dims.use Which dimensions to use as input features
#' @param genes.use If set, run the tSNE on this subset of genes
#' (instead of running on a set of reduced dimensions). Not set (NULL) by default
#' @param seed.use Random seed for the t-SNE
#' @param do.fast If TRUE, uses the Barnes-hut implementation, which runs
#' faster, but is less flexible. TRUE by default.
#' @param add.iter If an existing tSNE has already been computed, uses the
#' current tSNE to seed the algorithm and then adds additional iterations on top
#' of this
#' @param dim.embed The dimensional space of the resulting tSNE embedding
#' (default is 2). For example, set to 3 for a 3d tSNE
#' @param \dots Additional arguments to the tSNE call.
#' Most commonly used is
#' perplexity (expected number of neighbors default is 30)
#' @param distance.matrix If set, runs tSNE on the given distance matrix
#' instead of data matrix (experimental)
#'
#' @return Returns a Seurat object with a tSNE embedding in
#' object@@dr$tsne@cell.embeddings
#'
#' @importFrom Rtsne Rtsne
#' @importFrom tsne tsne
#'
#' @export
#'
RunTSNE <- function(
  object,
  reduction.use = "pca",
  cells.use = NULL,
  dims.use = 1:5,
  genes.use = NULL,
  seed.use = 1,
  do.fast = TRUE,
  add.iter = 0,
  dim.embed = 2,
  distance.matrix = NULL,
  ...
) {
  if (! is.null(x = distance.matrix)) {
    # a distance matrix implies running on all genes
    genes.use <- rownames(x = object@data)
  }
  if (is.null(x = genes.use)) {
    # default: embed the cells from a prior dimensional reduction
    data.use <- GetDimReduction(
      object = object,
      reduction.type = reduction.use,
      slot = "cell.embeddings"
    )[, dims.use]
  }
  if (! is.null(x = genes.use)) {
    data.use <- t(PrepDR(
      object = object,
      genes.use = genes.use))
  }
  set.seed(seed = seed.use)
  if (do.fast) {
    # Barnes-Hut implementation via Rtsne
    if (is.null(x = distance.matrix)) {
      data.tsne <- Rtsne(X = as.matrix(x = data.use), dims = dim.embed, ...)
    } else {
      data.tsne <- Rtsne(
        X = as.matrix(x = distance.matrix),
        dims = dim.embed,
        is_distance = TRUE
      )
    }
    data.tsne <- data.tsne$Y
  } else {
    data.tsne <- tsne(X = data.use, k = dim.embed, ...)
  }
  if (add.iter > 0) {
    # seed the slower tsne implementation with the existing embedding and run
    # extra iterations; the tsne() data argument is X (lowercase `x` was
    # silently swallowed by `...`, leaving X missing)
    data.tsne <- tsne(
      X = data.use,
      initial_config = as.matrix(x = data.tsne),
      max_iter = add.iter,
      ...
    )
  }
  colnames(x = data.tsne) <- paste0("tSNE_", 1:ncol(x = data.tsne))
  rownames(x = data.tsne) <- rownames(x = data.use)
  object <- SetDimReduction(
    object = object,
    reduction.type = "tsne",
    slot = "cell.embeddings",
    new.data = data.tsne
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = "tsne",
    slot = "key",
    new.data = "tSNE_"
  )
  # snapshot the call's arguments for later inspection via PrintTSNEParams
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunTSNE"))]
  object <- SetCalcParams(object = object, calculation = "RunTSNE", ... = parameters.to.store)
  if (! is.null(GetCalcParam(object = object, calculation = "RunTSNE", parameter = "genes.use"))) {
    object@calc.params$RunTSNE$genes.use <- colnames(data.use)
    object@calc.params$RunTSNE$cells.use <- rownames(data.use)
  }
  return(object)
}

#' Project Dimensional reduction onto full dataset
#'
#' Takes a pre-computed dimensional reduction (typically calculated on a subset
#' of genes) and projects this onto the entire dataset (all genes). Note that
#' the cell loadings will remain unchanged, but now there are gene loadings for
#' all genes.
#'
#'
#' @param object Seurat object
#' @param reduction.type Reduction to project ("pca" by default)
#' @param dims.print Number of dims to print genes for
#' @param dims.store Number of dims to store (default is 30)
#' @param genes.print Number of genes with highest/lowest loadings to print for
#' each PC
#' @param replace.dim Replace the existing data (overwrite
#' object@@dr$XXX@gene.loadings), not done by default.
#' @param do.center Center the dataset prior to projection (should be set to TRUE)
#' @param do.print Print top genes associated with the projected dimensions
#' @param assay.type Data type, RNA by default. Can be changed for multimodal
#' datasets (i.e. project a PCA done on RNA, onto CITE-seq data)
#'
#' @return Returns Seurat object with the projected values in
#' object@@dr$XXX@gene.loadings.full
#'
#' @export
#'
ProjectDim <- function(
  object,
  reduction.type = "pca",
  dims.print = 1:5,
  dims.store = 30,
  genes.print = 30,
  replace.dim = FALSE,
  do.center = FALSE,
  do.print = TRUE,
  assay.type = "RNA"
) {
  if (!
reduction.type %in% names(x = object@dr)) { + stop(paste(reduction.type, "dimensional reduction has not been computed")) + } + data.use <- GetAssayData( + object = object, + assay.type = assay.type, + slot = "scale.data" + ) + if (do.center) { + data.use <- scale(x = as.matrix(x = data.use), center = TRUE, scale = FALSE) + } + cell.embeddings <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "cell.embeddings" + ) + new.gene.loadings.full <- FastMatMult(m1 = data.use, m2 = cell.embeddings) + rownames(x = new.gene.loadings.full) <- rownames(x = data.use) + colnames(x = new.gene.loadings.full) <- colnames(x = cell.embeddings) + object <- SetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "gene.loadings.full", + new.data = new.gene.loadings.full + ) + if (replace.dim) { + object <- SetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "gene.loadings", + new.data = new.gene.loadings.full + ) + } + if (do.print) { + PrintDim( + object = object, + reduction.type = reduction.type, + genes.print = genes.print, + use.full = TRUE, + dims.print = dims.print + ) + } + return(object) +} + + +#' Project Principal Components Analysis onto full dataset +#' +#' Takes a pre-computed PCA (typically calculated on a subset of genes) and +#' projects this onto the entire dataset (all genes). Note that the cell +#' loadings remains unchanged, but now there are gene loading scores for all +#' genes. +#' +#' @param object Seurat object +#' @param do.print Print top genes associated with the projected PCs +#' @param pcs.print Number of PCs to print genes for +#' @param pcs.store Number of PCs to store (default is 30) +#' @param genes.print Number of genes with highest/lowest loadings to print for +#' each PC +#' @param replace.pc Replace the existing PCA (overwite +#' object@@dr$pca@gene.loadings), not done by default. 
+#' @param do.center Center the dataset prior to projection (should be set to TRUE) +#' +#' @return Returns Seurat object with the projected PCA values in +#' object@@dr$pca@gene.loadings.full +#' +#' @export +#' +ProjectPCA <- function( + object, + do.print = TRUE, + pcs.print = 1:5, + pcs.store = 30, + genes.print = 30, + replace.pc = FALSE, + do.center = FALSE +) { + return(ProjectDim( + object, + reduction.type = "pca", + dims.print = pcs.print, + genes.print = 30, + replace.dim = replace.pc, + do.center = do.center, + do.print = do.print, + dims.store = pcs.store + )) +} + +#' Perform Canonical Correlation Analysis +#' +#' Runs a canonical correlation analysis using a diagonal implementation of CCA. +#' For details about stored CCA calculation parameters, see +#' \code{\link{PrintCCAParams}}. +#' +#' @param object Seurat object +#' @param object2 Optional second object. If object2 is passed, object1 will be +#' considered as group1 and object2 as group2. +#' @param group1 First set of cells (or IDs) for CCA +#' @param group2 Second set of cells (or IDs) for CCA +#' @param group.by Factor to group by (column vector stored in object@@meta.data) +#' @param num.cc Number of canonical vectors to calculate +#' @param genes.use Set of genes to use in CCA. Default is object@@var.genes. If +#' two objects are given, the default is the union of both variable gene sets +#' that are also present in both objects. +#' @param scale.data Use the scaled data from the object +#' @param rescale.groups Rescale each set of cells independently +#' @return Returns Seurat object with the CCA stored in the @@dr$cca slot. If +#' one object is passed, the same object is returned. If two are passed, a +#' combined object is returned. +#' @export +RunCCA <- function( + object, + object2, + group1, + group2, + group.by, + num.cc = 20, + genes.use, + scale.data = TRUE, + rescale.groups = FALSE +) { + if (! missing(x = object2) && (! missing(x = group1) || ! 
missing(x = group2))) { + warning("Both object2 and group set. Continuing with objects defining the groups") + } + if (! missing(x = object2)) { + if (missing(x = genes.use)) { + genes.use <- union(x = object@var.genes, y = object2@var.genes) + if (length(x = genes.use) == 0) { + stop("No variable genes present. Run MeanVarPlot and retry") + } + } + if (scale.data) { + possible.genes <- intersect( + x = rownames(x = object@scale.data), + y = rownames(x = object2@scale.data) + ) + genes.use <- genes.use[genes.use %in% possible.genes] + data.use1 <- object@scale.data[genes.use, ] + data.use2 <- object2@scale.data[genes.use, ] + } else { + possible.genes <- intersect( + x = rownames(object@data), + y = rownames(object2@data) + ) + genes.use <- genes.use[genes.use %in% possible.genes] + data.use1 <- object@data[genes.use, ] + data.use2 <- object2@data[genes.use, ] + } + if (length(x = genes.use) == 0) { + stop("0 valid genes in genes.use") + } + } else { + if (missing(x = group1)) { + stop("group1 not set") + } + if (missing(x = group2)) { + stop("group2 not set") + } + if (! missing(x = group.by)) { + if (! group.by %in% colnames(x = object@meta.data)) { + stop("invalid group.by parameter") + } + } + if (missing(x = genes.use)) { + genes.use <- object@var.genes + if (length(x = genes.use) == 0) { + stop("No variable genes present. 
Run MeanVarPlot and retry") + } + } + if (missing(x = group.by)) { + cells.1 <- CheckGroup(object = object, group = group1, group.id = "group1") + cells.2 <- CheckGroup(object = object, group = group2, group.id = "group2") + } else { + object.current.ids <- object@ident + object <- SetAllIdent(object = object, id = group.by) + cells.1 <- CheckGroup(object = object, group = group1, group.id = "group1") + cells.2 <- CheckGroup(object = object, group = group2, group.id = "group2") + object <- SetIdent( + object = object, + cells.use = object@cell.names, + ident.use = object.current.ids + ) + } + if (scale.data) { + if (rescale.groups) { + data.use1 <- ScaleData( + object = object, + data.use = object@data[genes.use, cells.1] + ) + data.use1 <- data.use1@scale.data + data.use2 <- ScaleData( + object = object, + data.use = object@data[genes.use, cells.2] + ) + data.use2 <- data.use2@scale.data + } else { + data.use1 <- object@scale.data[genes.use, cells.1] + data.use2 <- object@scale.data[genes.use, cells.2] + } + } else { + data.use1 <- object@data[genes.use, cells.1] + data.use2 <- object@data[genes.use, cells.2] + } + } + genes.use <- CheckGenes(data.use = data.use1, genes.use = genes.use) + genes.use <- CheckGenes(data.use = data.use2, genes.use = genes.use) + data.use1 <- data.use1[genes.use, ] + data.use2 <- data.use2[genes.use, ] + + cat("Running CCA\n", file = stderr()) + + cca.results <- CanonCor( + mat1 = data.use1, + mat2 = data.use2, + standardize = TRUE, + k = num.cc + ) + cca.data <- rbind(cca.results$u, cca.results$v) + colnames(x = cca.data) <- paste0("CC", 1:num.cc) + if (! 
missing(x = object2)) { + cat("Merging objects\n", file = stderr()) + combined.object <- MergeSeurat( + object1 = object, + object2 = object2, + do.scale = FALSE, + do.center = FALSE + ) + # to improve, to pull the same normalization and scale params as previously used + combined.object <- ScaleData(object = combined.object) + combined.object@scale.data[is.na(x = combined.object@scale.data)] <- 0 + combined.object@var.genes <- genes.use + rownames(cca.data) <- colnames(combined.object@data) + combined.object <- SetDimReduction( + object = combined.object, + reduction.type = "cca", + slot = "cell.embeddings", + new.data = cca.data + ) + combined.object <- SetDimReduction( + object = combined.object, + reduction.type = "cca", + slot = "key", + new.data = "CC" + ) + combined.object <- ProjectDim( + object = combined.object, + reduction.type = "cca", + do.print = FALSE + ) + combined.object <- SetDimReduction( + object = combined.object, + reduction.type = "cca", + slot = "gene.loadings", + new.data = GetGeneLoadings( + object = combined.object, + reduction.type = "cca", + use.full = TRUE, + genes.use = genes.use + ) + ) + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunCCA"))] + combined.object <- SetCalcParams(object = combined.object, + calculation = "RunCCA", + ... 
= parameters.to.store) + combined.object <- SetSingleCalcParam(object = combined.object, + calculation = "RunCCA", + parameter = "object.project", + value = object@project.name) + combined.object <- SetSingleCalcParam(object = combined.object, + calculation = "RunCCA", + parameter = "object2.project", + value = object2@project.name) + return(combined.object) + } else { + object <- SetDimReduction( + object = object, + reduction.type = "cca", + slot = "cell.embeddings", + new.data = cca.data + ) + object <- SetDimReduction( + object = object, + reduction.type = "cca", + slot = "key", + new.data = "CC" + ) + + object <- ProjectDim(object = object, + reduction.type = "cca", + do.print = FALSE) + object@scale.data[is.na(x = object@scale.data)] <- 0 + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunCCA"))] + object <- SetCalcParams(object = object, + calculation = "RunCCA", + ... = parameters.to.store) + return(object) + } +} + +#' Calculate the ratio of variance explained by ICA or PCA to CCA +#' +#' @param object Seurat object +#' @param reduction.type type of dimensional reduction to compare to CCA (pca, +#' pcafast, ica) +#' @param grouping.var variable to group by +#' @param dims.use Vector of dimensions to project onto (default is the 1:number +#' stored for cca) +#' +#' @return Returns Seurat object with ratio of variance explained stored in +#' object@@meta.data$var.ratio +#' @export +#' +CalcVarExpRatio <- function( + object, + reduction.type = "pca", + grouping.var, + dims.use +) { + if (missing(x = grouping.var)) { + stop("Need to provide grouping variable") + } + if (missing(x = dims.use)) { + dims.use <- 1:ncol(x = GetCellEmbeddings(object = object, reduction.type = "cca")) + } + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("CalcVarExpRatio"))] + object <- SetCalcParams(object = object, + calculation = "CalcVarExpRatio", + ... 
= parameters.to.store) + groups <- as.vector(x = unique(x = FetchData( + object = object, + vars.all = grouping.var + )[, 1])) + genes.use <- rownames(x = GetGeneLoadings(object = object, reduction.type = "cca")) + var.ratio <- data.frame() + for (group in groups) { + cat(paste("Calculating for", group, "\n"), file = stderr()) + group.cells <- WhichCells( + object = object, + subset.name = grouping.var, + accept.value = group + ) + cat(paste("\t Separating", group, "cells\n"), file = stderr()) + group.object <- SubsetData(object = object, cells.use = group.cells) + cat("\t Running Dimensional Reduction \n", file = stderr()) + ldp.cca <- CalcLDProj( + object = group.object, + reduction.type = "cca", + dims.use = dims.use, + genes.use = genes.use + ) + group.object <- CalcProjectedVar( + object = group.object, + low.dim.data = ldp.cca, + reduction.type = "cca", + dims.use = dims.use, + genes.use = genes.use + ) + if (reduction.type == "pca") { + temp.matrix=PrepDR(group.object,genes.use = genes.use) + group.object <- RunPCA( + object = group.object, + pc.genes = genes.use, + do.print = FALSE, + center=rowMeans(temp.matrix) + ) + ldp.pca <- CalcLDProj( + object = group.object, + reduction.type = "pca", + dims.use = dims.use, + genes.use = genes.use + ) + group.object <- CalcProjectedVar( + object = group.object, + low.dim.data = ldp.pca, + reduction.type = "pca", + dims.use = dims.use, + genes.use = genes.use + ) + group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] / + group.object@meta.data[, "pca.var", drop = FALSE] + } else if (reduction.type == "ica") { + group.object <- RunICA( + object = group.object, + ic.genes = genes.use, + print.results = FALSE + ) + ldp.ica <- CalcLDProj( + object = group.object, + reduction.type = "ica", + dims.use = dims.use, + genes.use = genes.use + ) + group.object <- CalcProjectedVar( + object = group.object, + low.dim.data = ldp.ica, + reduction.type = "ica", + dims.use = dims.use, + genes.use = genes.use + ) + 
group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] / + group.object@meta.data[, "ica.var", drop = FALSE] + } else { + stop(paste("reduction.type", reduction.type, "not supported")) + } + var.ratio <- rbind(var.ratio, group.var.ratio) + } + var.ratio$cell.name <- rownames(x = var.ratio) + eval(expr = parse(text = paste0( + "object@meta.data$var.ratio.", + reduction.type, + "<- NULL" + ))) + colnames(x = var.ratio) <- c( + paste0("var.ratio.", reduction.type), + "cell.name" + ) + object@meta.data$cell.name <- rownames(x = object@meta.data) + object@meta.data <- merge(x = object@meta.data, y = var.ratio, by = "cell.name") + rownames(x = object@meta.data) <- object@meta.data$cell.name + object@meta.data$cell.name <- NULL + return(object) +} + +#' Align subspaces using dynamic time warping (DTW) +#' +#' Aligns subspaces so that they line up across grouping variable (only +#' implemented for case with 2 categories in grouping.var) +#' +#' +#' @param object Seurat object +#' @param reduction.type reduction to align scores for +#' @param grouping.var Name of the grouping variable for which to align the scores +#' @param dims.align Dims to align, default is all +#' @param num.genes Number of genes to use in construction of "metagene" +#' @param show.plots show debugging plots +#' +#' @return Returns Seurat object with the dims aligned, stored in +#' object@@dr$reduction.type.aligned +#' +#' @importFrom dtw dtw +#' @importFrom pbapply pbapply +#' +#' @export +#' +AlignSubspace <- function( + object, + reduction.type, + grouping.var, + dims.align, + num.genes = 30, + show.plots = FALSE +) { + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("AlignSubspace"))] + object <- SetCalcParams(object = object, + calculation = paste0("AlignSubspace.", reduction.type), + ... 
= parameters.to.store)
  # Temporarily switch identities to the grouping variable; restored below.
  ident.orig <- object@ident
  object <- SetAllIdent(object = object, id = grouping.var)
  levels.split <- names(x = sort(x = table(object@ident)))
  if (length(x = levels.split) != 2) {
    stop(paste0(
      "There are not two options for ",
      grouping.var,
      ". \n Current groups include: ",
      paste(levels.split, collapse = ", ")
    ))
  }
  objects <- list(
    SubsetData(object = object, ident.use = levels.split[1]),
    SubsetData(object = object, ident.use = levels.split[2])
  )
  object@ident <- ident.orig
  cc.loadings <- list()
  scaled.data <- list()
  cc.embeds <- list()
  # Rescale each group independently and project the reduction onto all genes.
  for (i in 1:2) {
    cat(paste0("Rescaling group ", i, "\n"), file = stderr())
    objects[[i]] <- ScaleData(object = objects[[i]])
    objects[[i]]@scale.data[is.na(x = objects[[i]]@scale.data)] <- 0
    objects[[i]] <- ProjectDim(
      object = objects[[i]],
      reduction.type = reduction.type,
      do.print = FALSE
    )
    cc.loadings[[i]] <- GetGeneLoadings(
      object = objects[[i]],
      reduction.type = reduction.type,
      use.full = TRUE
    )
    cc.embeds[[i]] <- GetCellEmbeddings(
      object = objects[[i]],
      reduction.type = reduction.type
    )
    scaled.data[[i]] <- objects[[i]]@scale.data
  }
  cc.embeds.both <- GetCellEmbeddings(object = object, reduction.type = reduction.type)
  colnames(cc.embeds.both) <- paste0("A", colnames(x = cc.embeds.both))
  cc.embeds.orig <- cc.embeds.both
  for (cc.use in dims.align) {
    cat(paste0("Aligning dimension ", cc.use, "\n"), file = stderr())
    # Rank genes by loading strength in both groups; keep the 200 genes whose
    # weaker-group rank is highest (i.e. genes strong in BOTH groups).
    genes.rank <- data.frame(
      rank(x = abs(x = cc.loadings[[1]][, cc.use])),
      rank(x = abs(x = cc.loadings[[2]][, cc.use])),
      cc.loadings[[1]][, cc.use],
      cc.loadings[[2]][, cc.use]
    )
    genes.rank$min <- apply(X = genes.rank[,1:2], MARGIN = 1, FUN = min)
    genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
    genes.top <- rownames(x = genes.rank)[1:200]
    # Biweight midcorrelation of each candidate gene with the dimension scores.
    bicors <- list()
    for (i in 1:2) {
      cc.vals <- cc.embeds[[i]][, cc.use]
      bicors[[i]] <- pbsapply(
        X = genes.top,
        FUN = function(x) {
          return(BiweightMidcor(x = cc.vals, y = scaled.data[[i]][x, ]))
        }
      )
    }
    # Re-rank by correlation and keep num.genes genes for the metagene.
    genes.rank <- data.frame(
      rank(x = abs(x = bicors[[1]])),
      rank(x = abs(x = bicors[[2]])),
      bicors[[1]],
      bicors[[2]]
    )
    genes.rank$min <- apply(X = abs(x = genes.rank[, 1:2]), MARGIN = 1, FUN = min)
    genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
    genes.use <- rownames(x = genes.rank)[1:num.genes]
    metagenes <- list()
    multvar.data <- list()
    for (i in 1:2) {
      # Flip genes so correlation signs agree; order cells by dimension score.
      scaled.use <- sweep(
        x = scaled.data[[i]][genes.use, ],
        MARGIN = 1,
        STATS = sign(x = genes.rank[genes.use, i + 2]),
        FUN = "*"
      )
      scaled.use <- scaled.use[, names(x = sort(x = cc.embeds[[i]][, cc.use]))]
      # NOTE(review): 'remove.na' is not an argument of mean() (na.rm is); it is
      # silently swallowed via '...'. The result is overwritten immediately
      # below, so this apply() appears to be dead code — confirm before removing.
      metagenes[[i]] <- apply(
        X = scaled.use[genes.use, ],
        MARGIN = 2,
        FUN = mean,
        remove.na = TRUE
      )
      # Metagene = loading-weighted sum of the selected genes, cells in the
      # sorted order established above.
      metagenes[[i]] <- (
        cc.loadings[[i]][genes.use, cc.use] %*% scaled.data[[i]][genes.use, ]
      )[1, colnames(x = scaled.use)]
    }

    mean.difference <- mean(x = ReferenceRange(x = metagenes[[1]])) -
      mean(x = ReferenceRange(x = metagenes[[2]]))
    metric.use <- "Euclidean"
    align.1 <- ReferenceRange(x = metagenes[[1]])
    align.2 <- ReferenceRange(x = metagenes[[2]])
    # Shift group 2 by the quantile offset (searched over the 10%-90% range)
    # that best matches group 1 before warping.
    a1q <- sapply(
      X = seq(from = 0, to = 1, by = 0.001),
      FUN = function(x) {
        return(quantile(x = align.1, probs = x))
      }
    )
    a2q <- sapply(
      X = seq(from = 0, to = 1, by = 0.001),
      FUN = function(x) {
        quantile(x = align.2, probs = x)
      }
    )
    iqr <- (a1q - a2q)[100:900]
    iqr.x <- which.min(x = abs(x = iqr))
    iqrmin <- iqr[iqr.x]
    if (show.plots) {
      print(iqrmin)
    }
    align.2 <- align.2 + iqrmin
    # Warp the two metagene traces onto each other with dynamic time warping.
    alignment <- dtw(
      x = align.1,
      y = align.2,
      keep = TRUE,
      dist.method = metric.use
    )
    alignment.map <- data.frame(alignment$index1, alignment$index2)
    alignment.map$cc_data1 <- sort(cc.embeds[[1]][, cc.use])[alignment$index1]
    alignment.map$cc_data2 <- sort(cc.embeds[[2]][, cc.use])[alignment$index2]
    alignment.map.orig <- alignment.map
    # Keep the first match for each group-1 index, then replace group-1 cells'
    # embeddings with the matched group-2 values.
    alignment.map <- alignment.map[!
      duplicated(x = alignment.map$alignment.index1), ]
    cc.embeds.both[names(x = sort(x = cc.embeds[[1]][, cc.use])), cc.use] <- alignment.map$cc_data2
    if (show.plots) {
      par(mfrow = c(3, 2))
      plot(x = ReferenceRange(x = metagenes[[1]]), main = cc.use)
      plot(x = ReferenceRange(x = metagenes[[2]]))
      plot(
        x = ReferenceRange(x = metagenes[[1]])[(alignment.map.orig$alignment.index1)],
        pch = 16
      )
      points(
        x = ReferenceRange(metagenes[[2]])[(alignment.map.orig$alignment.index2)],
        col = "red",
        pch = 16,
        cex = 0.4
      )
      plot(x = density(x = alignment.map$cc_data2))
      lines(x = density(x = sort(x = cc.embeds[[2]][, cc.use])), col = "red")
      plot(x = alignment.map.orig$cc_data1)
      points(x = alignment.map.orig$cc_data2, col = "red")
    }
  }
  # Store the aligned embeddings under "<reduction.type>.aligned", key "A<key>".
  new.type <- paste0(reduction.type, ".aligned")
  new.key <- paste0(
    "A",
    GetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "key"
    )
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = new.type,
    slot = "cell.embeddings",
    new.data = scale(x = cc.embeds.both)
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = new.type,
    slot = "key",
    new.data = new.key
  )
  return(object)
}

#' Run diffusion map
#'
#' @param object Seurat object
#' @param cells.use Which cells to analyze (default, all cells)
#' @param dims.use Which dimensions to use as input features
#' @param genes.use If set, run the diffusion map procedure on this subset of
#' genes (instead of running on a set of reduced dimensions). Not set (NULL) by
#' default
#' @param reduction.use Which dimensional reduction (PCA or ICA) to use for the
#' diffusion map. Default is PCA
#' @param q.use Quantile to use
#' @param max.dim Max dimension to keep from diffusion calculation
#' @param scale.clip Max/min value for scaled data. Default is 10
#' @param ...
Additional arguments to the diffuse call
#'
#' @return Returns a Seurat object with a diffusion map
#'
#' @import diffusionMap
#'
#' @export
#'
RunDiffusion <- function(
  object,
  cells.use = NULL,
  dims.use = 1:5,
  genes.use = NULL,
  reduction.use = 'pca',
  q.use = 0.01,
  max.dim = 2,
  scale.clip = 10,
  ...
) {
  cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data))
  # Input features: either stored reduction scores (default), or — when
  # genes.use is given — clipped expression values for those genes.
  if (is.null(x = genes.use)) {
    dim.code <- GetDimReduction(
      object = object,
      reduction.type = reduction.use,
      slot = 'key'
    )
    dim.codes <- paste0(dim.code, dims.use)
    data.use <- FetchData(object = object, vars.all = dim.codes)
  }
  if (! is.null(x = genes.use)) {
    # NOTE(review): genes are filtered against scale.data rownames but values
    # are taken from object@data — confirm this mix is intentional.
    genes.use <- intersect(x = genes.use, y = rownames(x = object@scale.data))
    data.use <- MinMax(
      data = t(x = object@data[genes.use, cells.use]),
      min = -1 * scale.clip,
      max = scale.clip
    )
  }
  # Record this call's arguments in the object's calculation log.
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunDiffusion"))]
  object <- SetCalcParams(object = object,
                          calculation = "RunDiffusion",
                          ... = parameters.to.store)
  data.dist <- dist(data.use)
  # diffuse() comes from the diffusionMap package (see @import above); $X is
  # the matrix of diffusion-map coordinates.
  data.diffusion <- data.frame(
    diffuse(
      D = data.dist,
      neigen = max.dim,
      maxdim = max.dim,
      ...
    )$X
  )
  colnames(x = data.diffusion) <- paste0("DM", 1:ncol(x = data.diffusion))
  rownames(x = data.diffusion) <- cells.use
  # Winsorize each DM coordinate at the q.use / (1 - q.use) quantiles.
  for (i in 1:max.dim) {
    x <- data.diffusion[,i]
    x <- MinMax(
      data = x,
      min = quantile(x = x, probs = q.use),
      # NOTE(review): unnamed third argument fills MinMax's 'max' positionally;
      # naming it (max = ...) would be clearer.
      quantile(x = x, probs = 1-q.use)
    )
    data.diffusion[, i] <- x
  }
  object <- SetDimReduction(
    object = object,
    reduction.type = "dm",
    slot = "cell.embeddings",
    new.data = as.matrix(x = data.diffusion)
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = "dm",
    slot = "key",
    new.data = "DM"
  )
  return(object)
}
diff --git a/R/dimensional_reduction_internal.R b/R/dimensional_reduction_internal.R
new file mode 100644
index 000000000..40ffd9ae2
--- /dev/null
+++ b/R/dimensional_reduction_internal.R
@@ -0,0 +1,222 @@
#' @include seurat.R
NULL
# Set up dim.reduction class
# S4 container for one stored dimensional reduction (PCA/ICA/CCA/DM/...).

dim.reduction <- setClass(
  Class = "dim.reduction",
  slots = list(
    cell.embeddings = "matrix",
    gene.loadings = "matrix",
    gene.loadings.full = "matrix",
    sdev = "numeric",
    key = "character",
    jackstraw="ANY",
    misc = "ANY"
  )
)

# Prep data for dimensional reduction
#
# Common checks and preparatory steps before running certain dimensional
# reduction techniques
#
# @param object Seurat object
# @param genes.use Genes to use as input for the dimensional reduction technique.
# Default is object@@var.genes
# @param use.imputed Whether to run the dimensional reduction on imputed values
# @param assay.type Assay to scale data for. Default is RNA. Can be changed for multimodal analysis
#
# @return Scaled data matrix restricted to the requested genes with
# non-zero variance

PrepDR <- function(
  object,
  genes.use = NULL,
  use.imputed = FALSE,
  assay.type="RNA"
) {

  if (length(object@var.genes) == 0 && is.null(x = genes.use)) {
    stop("Variable genes haven't been set.
Run MeanVarPlot() or provide a vector
  of genes names in genes.use and retry.")
  }
  # Imputed path row-scales the imputed matrix; default path uses scale.data.
  if (use.imputed) {
    data.use <- t(x = scale(x = t(x = object@imputed)))
  } else {
    data.use <- GetAssayData(object, assay.type = assay.type,slot = "scale.data")
  }
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  genes.use <- unique(x = genes.use[genes.use %in% rownames(x = data.use)])
  # Drop zero-variance genes; they carry no information for the reduction.
  genes.var <- apply(X = data.use[genes.use, ], MARGIN = 1, FUN = var)
  genes.use <- genes.use[genes.var > 0]
  genes.use <- genes.use[! is.na(x = genes.use)]
  data.use <- data.use[genes.use, ]
  return(data.use)
}

# Get the top genes associated with given dimensional reduction scores
#
# @param i Dimension for which to pull genes
# @param dim.scores Matrix containing the dimensional reduction scores
# @param do.balanced Whether to pull genes associated with both large and small
# scores (+/-)
# @param num.genes Number of genes to return
#
# @return Character vector of gene names (rownames of dim.scores)

GetTopGenes <- function(
  i,
  dim.scores,
  do.balanced = FALSE,
  num.genes = 30
) {
  if (do.balanced) {
    # Half the genes from each tail of the score distribution.
    num.genes <- round(x = num.genes / 2)
    sx <- dim.scores[order(dim.scores[, i]), , drop = FALSE]
    genes.1 <- (rownames(x = sx[1:num.genes, , drop = FALSE]))
    genes.2 <- (rownames(x = sx[(nrow(x = sx) - num.genes + 1):nrow(x = sx), , drop = FALSE]))
    return(c(genes.1, genes.2))
  } else {
    # Top genes by absolute score, returned in ascending signed-score order.
    sx <- dim.scores[rev(x = order(abs(x = dim.scores[, i]))), ,drop = FALSE]
    genes.1 <- (rownames(x = sx[1:num.genes, , drop = FALSE]))
    genes.1 <- genes.1[order(dim.scores[genes.1, i])]
    return(genes.1)
  }
}

# Check group exists either as an ident or that all cells passed as vector are
# present
#
# @param object Seurat object
# @param group Identity or vector of cell names
# @param group.id Corresponds to either the group1 or group2 parameter from
# RunCCA

CheckGroup <- function(object, group, group.id) {
  # Prefer interpreting 'group' as identity classes; fall back to cell names.
  if (all(group %in% unique(x = object@ident))) {
    cells.use <- WhichCells(object = object, ident =
group)
  } else {
    if (all(group %in% object@cell.names)) {
      cells.use <- group
    } else {
      stop(paste(
        group.id,
        "must be either a vector of valid cell names or idents"
      ))
    }
  }
  return(cells.use)
}

# Check that genes have non-zero variance
#
# @param data.use Gene expression matrix (genes are rows)
# @param genes.use Genes in expression matrix to check
#
# @return Returns a vector of genes that is the subset of genes.use
# that have non-zero variance
#
CheckGenes <- function(data.use, genes.use) {
  genes.var <- apply(X = data.use[genes.use, ], MARGIN = 1, FUN = var)
  genes.use <- genes.use[genes.var > 0]
  genes.use <- genes.use[! is.na(x = genes.use)]
  return(genes.use)
}

# Run the diagonal canonical correlation procedure
#
# @param mat1 First matrix
# @param mat2 Second matrix
# @param standardize Standardize matrices - scales columns to have unit
# variance and mean 0
# @param k Number of canonical correlation vectors (CCs) to calculate
#
# @return Returns the canonical correlation vectors - corresponding
# to the left and right singular vectors after SVD - as well
# as the singular values.
#
CanonCor <- function(mat1, mat2, standardize = TRUE, k = 20) {
  # Fixed seed so the truncated (randomized) SVD from irlba is reproducible.
  set.seed(seed = 42)
  if (standardize) {
    mat1 <- Standardize(mat = mat1, display_progress = FALSE)
    mat2 <- Standardize(mat = mat2, display_progress = FALSE)
  }
  # CCs are the singular vectors of t(mat1) %*% mat2.
  mat3 <- FastMatMult(m1 = t(x = mat1), m2 = mat2)
  cca.svd <- irlba(A = mat3, nv = k)
  return(list(u = cca.svd$u, v = cca.svd$v, d = cca.svd$d))
}

# Calculate percent variance explained
#
# Projects dataset onto the orthonormal space defined by some dimensional
# reduction technique (e.g. PCA, CCA) and calculates the percent of the
# variance in gene expression explained by each cell in that lower dimensional
# space.
#
# @param object Seurat object
# @param reduction.type Name of the reduction to use for the projection
# @param dims.use Vector of dimensions to project onto (default is the
# 1:number stored for given technique)
# @param genes.use vector of genes to use in calculation
#
# @return Returns a Seurat object wih the variance in gene
# expression explained by each cell in a low dimensional
# space stored as metadata.
#
CalcProjectedVar <- function(
  object,
  low.dim.data,
  reduction.type = "pca",
  dims.use,
  genes.use
) {
  # Compute the low-dimensional reconstruction if the caller didn't supply one.
  if (missing(x = low.dim.data)) {
    low.dim.data <- CalcLDProj(
      object = object,
      reduction.type = reduction.type,
      dims.use = dims.use,
      genes.use = genes.use
    )
  }
  # Per-cell variance of the reconstruction, stored as "<reduction>.var".
  projected.var <- apply(X = low.dim.data, MARGIN = 2, FUN = var)
  calc.name <- paste0(reduction.type, ".var")
  object <- AddMetaData(
    object = object,
    metadata = projected.var,
    col.name = calc.name
  )
  return(object)
}

# Calculate a low dimensional projection of the data. First forms an orthonormal
# basis of the gene loadings via QR decomposition, projects the data onto that
# basis, and reconstructs the data using only the dimensions specified.
#
# @param object Seurat object
# @param reduction.type Type of dimensional reduction to use
# @param dims.use Dimensions to use in calculation
# @param genes.use Genes to consider when calculating
#
# @return Returns a matrix with the low dimensional reconstruction
#
CalcLDProj <- function(object, reduction.type, dims.use, genes.use) {
  if (missing(x = dims.use)){
    dims.use <- 1:ncol(x = GetCellEmbeddings(
      object = object,
      reduction.type = reduction.type
    ))
  }
  # NOTE(review): when genes.use is missing, '[genes.use, ]' relies on R's
  # missing-index semantics in '[' to select all rows; the missing() check
  # below then fills in the gene names. Subtle, but appears intentional.
  x.vec <- GetGeneLoadings(
    object = object,
    reduction.type = reduction.type,
    dims.use = dims.use
  )[genes.use, ]
  # form orthonormal basis via QR
  x.norm <- qr.Q(qr = qr(x = x.vec))
  if (missing(x = genes.use)) {
    genes.use <- rownames(x = x.vec)
  }
  data.use <- object@scale.data[genes.use, ]
  # project data onto othronormal basis
  projected.data <- t(x = data.use) %*% x.norm
  # reconstruct data using only dims specified
  low.dim.data <- x.norm %*% t(x = projected.data)
  return(low.dim.data)
}

diff --git a/R/dimensional_reduction_utilities.R b/R/dimensional_reduction_utilities.R
new file mode 100644
index 000000000..f9156241f
--- /dev/null
+++ b/R/dimensional_reduction_utilities.R
@@ -0,0 +1,709 @@
######################## Accessor/Mutator Functions ############################

#' Dimensional Reduction Accessor Function
#'
#' General accessor function for dimensional reduction objects. Pulls slot
#' contents for specified stored dimensional reduction analysis.
#'
#' @param object Seurat object
#' @param reduction.type Type of dimensional reduction to fetch (default is PCA)
#' @param slot Specific information to pull (must be one of the following:
#' "cell.embeddings", "gene.loadings", "gene.loadings.full", "sdev", "key", "misc")
#'
#' @return Returns specified slot results from given reduction technique
#'
#' @export
#'
GetDimReduction <- function(
  object,
  reduction.type = "pca",
  slot = "gene.loadings"
) {
  if (!
(reduction.type %in% names(object@dr))) {
    stop(paste(reduction.type, " dimensional reduction has not been computed"))
  }
  # Resolve object@dr$<reduction.type> and verify the requested slot exists.
  # NOTE(review): the eval(parse(...)) pair could be written as
  # slot(object@dr[[reduction.type]], slot) without string construction.
  reduction <- paste0("object@dr$", reduction.type)
  reduction.slots <- slotNames(x = eval(expr = parse(text = reduction)))
  if (! (slot %in% reduction.slots)) {
    stop(paste0(slot, " slot doesn't exist"))
  }
  return(eval(expr = parse(text = paste0(reduction, "@", slot))))
}

#' Dimensional Reduction Cell Embeddings Accessor Function
#'
#' Pull cell embeddings matrix for specified stored dimensional reduction
#' analysis
#'
#' @param object Seurat object
#' @param reduction.type Type of dimensional reduction to fetch (default is PCA)
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param cells.use Cells to include (default is all cells)
#'
#' @return Cell embedding matrix for given reduction, cells, and dimensions
#'
#' @export
#'
GetCellEmbeddings <- function(
  object,
  reduction.type = "pca",
  dims.use = NULL,
  cells.use = NULL
) {
  object.embed <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "cell.embeddings"
  )
  if (length(x = object.embed) == 0) {
    stop(paste0("Cell embeddings slot for ", reduction.type, " is empty."))
  }
  # Unknown cells warn and are dropped; unknown dimensions are a hard error.
  cells.use <- SetIfNull(x = cells.use, default = rownames(x = object.embed))
  if (any(! cells.use %in% rownames(x = object.embed))) {
    missing.cells <- paste0(
      cells.use[which(x = ! cells.use %in% rownames(x = object.embed))],
      collapse = ", "
    )
    warning(paste0("Could not find the following cell names: ", missing.cells))
    cells.use <- intersect(x = cells.use, y = rownames(x = object.embed))
  }
  dims.use <- SetIfNull(x = dims.use, default = 1:ncol(x = object.embed))
  if (any(!dims.use %in% 1:ncol(x = object.embed))) {
    missing.dims <- paste0(
      dims.use[which(x = ! dims.use %in% 1:ncol(x = object.embed))],
      collapse = ", "
    )
    stop(paste0("Could not find the following dimensions: ", missing.dims))
  }
  object.embed <- object.embed[cells.use, dims.use, drop = FALSE]
  object.key <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "key"
  )
  # Re-key column names (e.g. "PC1", "PC2") when a key is stored.
  if (length(x = object.key) == 0) {
    colnames(x = object.embed) <- NULL
  } else {
    colnames(x = object.embed) <- paste0(object.key, dims.use)
  }
  return(object.embed)
}

#' Dimensional Reduction Gene Loadings Accessor Function
#'
#' Pull gene loadings matrix for specified stored dimensional reduction analysis.
#'
#' @param object Seurat object
#' @param reduction.type Type of dimensional reduction to fetch (default is PCA)
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param genes.use Genes to include (default is all genes)
#' @param use.full Return projected gene loadings (default is FALSE)
#' @return Gene loading matrix for given reduction, cells, and genes
#' @export
GetGeneLoadings <- function(
  object,
  reduction.type = "pca",
  dims.use = NULL,
  genes.use = NULL,
  use.full = FALSE
) {
  # use.full selects the projected ("full") loadings over all genes.
  if (use.full) {
    gene.loadings <- GetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "gene.loadings.full"
    )
  } else {
    gene.loadings <- GetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "gene.loadings"
    )
  }
  if (length(x = gene.loadings) == 0) {
    stop(paste("gene loadings slot for", reduction.type, "is empty."))
  }
  # Unknown genes warn and are dropped; unknown dimensions are a hard error.
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = gene.loadings))
  if (any(! genes.use %in% rownames(x = gene.loadings))) {
    missing.genes <- paste0(
      genes.use[which(x = ! genes.use %in% rownames(x = gene.loadings))],
      collapse = ", "
    )
    warning(paste("Could not find the following gene names:", missing.genes))
    genes.use <- intersect(x = genes.use, y = rownames(x = gene.loadings))
  }
  dims.use <- SetIfNull(x = dims.use, default = 1:ncol(x = gene.loadings))
  if (any(! dims.use %in% 1:ncol(x = gene.loadings))) {
    missing.dims <- paste0(
      dims.use[which(x = ! dims.use %in% 1:ncol(x = gene.loadings))],
      collapse = ", "
    )
    stop(paste("Could not find the following dimensions:", missing.dims))
  }
  gene.loadings <- gene.loadings[genes.use, dims.use, drop = FALSE]
  object.key <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "key"
  )
  if (length(x = object.key) == 0) {
    colnames(x = gene.loadings) <- NULL
  } else {
    colnames(x = gene.loadings) <- paste0(object.key, dims.use)
  }
  return(gene.loadings)
}

#' Dimensional Reduction Mutator Function
#'
#' Set information for specified stored dimensional reduction analysis
#'
#' @param object Seurat object
#' @param reduction.type Type of dimensional reduction to set
#' @param slot Specific information to set (must be one of the following:
#' "cell.embeddings", "gene.loadings", "gene.loadings.full", "sdev", "key",
#' "misc")
#' @param new.data New data to set
#' @return Seurat object with updated slot
#' @export
SetDimReduction <- function(
  object,
  reduction.type,
  slot,
  new.data
) {
  # Update an existing dim.reduction in place, or create a fresh one.
  # NOTE(review): eval(parse(...)) could be replaced with slot<- / [[<-.
  if (reduction.type %in% names(x = object@dr)) {
    eval(expr = parse(text = paste0(
      "object@dr$",
      reduction.type,
      "@",
      slot,
      "<- new.data"
    )))
  } else {
    new.dr <- new(Class = "dim.reduction")
    eval(expr = parse(text = paste0("new.dr@", slot, "<- new.data")))
    eval(expr = parse(text = paste0("object@dr$", reduction.type, "<- new.dr")))
  }
  return(object)
}

################### Convienence functions for easy interaction #################

#' Diffusion Maps Cell Embeddings Accessor Function
#'
#' Pull Diffusion maps
cell embedding matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param cells.use Cells to include (default is all cells)
#'
#' @return Diffusion maps embedding matrix for given cells and DMs
#'
#' @export
#'
DMEmbed <- function(
  object,
  dims.use = NULL,
  cells.use = NULL
) {
  # Thin wrapper around GetCellEmbeddings with reduction.type fixed to "dm".
  return(GetCellEmbeddings(
    object = object,
    reduction.type = "dm",
    dims.use = dims.use,
    cells.use = cells.use
  ))
}

#' PCA Cell Embeddings Accessor Function
#'
#' Pull PCA cell embedding matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param cells.use Cells to include (default is all cells)
#'
#' @return PCA cell embedding matrix for given cells and PCs
#'
#' @export
#'
PCAEmbed <- function(
  object,
  dims.use = NULL,
  cells.use = NULL
) {
  # Thin wrapper around GetCellEmbeddings with reduction.type fixed to "pca".
  return(GetCellEmbeddings(
    object = object,
    reduction.type = "pca",
    dims.use = dims.use,
    cells.use = cells.use
  ))
}

#' ICA Cell Embeddings Accessor Function
#'
#' Pull ICA cell embeddings matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param cells.use Cells to include (default is all cells)
#'
#' @return ICA cell embeddings matrix for given cells and ICs
#'
#' @export
#'
ICAEmbed <- function(
  object,
  dims.use = NULL,
  cells.use = NULL
) {
  # Thin wrapper around GetCellEmbeddings with reduction.type fixed to "ica".
  return(GetCellEmbeddings(
    object = object,
    reduction.type = "ica",
    dims.use = dims.use,
    cells.use = cells.use
  ))
}

#' PCA Gene Loadings Accessor Function
#'
#' Pull the PCA gene loadings matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param genes.use Genes to include (default is all genes)
#' @param use.full Return projected gene loadings (default is FALSE)
#'
#' @return PCA gene loading matrix for given genes and PCs
#'
#' @export
#'
PCALoad <- function(
  object,
  dims.use = NULL,
  genes.use = NULL,
  use.full = FALSE
) {
  # Thin wrapper around GetGeneLoadings with reduction.type fixed to "pca".
  return(GetGeneLoadings(
    object = object,
    reduction.type = "pca",
    dims.use = dims.use,
    genes.use = genes.use,
    use.full = use.full
  ))
}

#' ICA Gene Loadings Accessor Function
#'
#' Pull the ICA gene loadings matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param genes.use Genes to include (default is all)
#' @param use.full Return projected gene loadings (default is FALSE)
#'
#' @return ICA gene loading matrix for given genes and ICs
#'
#' @export
#'
ICALoad <- function(
  object,
  dims.use = NULL,
  genes.use = NULL,
  use.full = FALSE
) {
  # Thin wrapper around GetGeneLoadings with reduction.type fixed to "ica".
  return(GetGeneLoadings(
    object = object,
    reduction.type = "ica",
    dims.use = dims.use,
    genes.use = genes.use,
    use.full = use.full
  ))
}

#' Diffusion Maps Gene Loading Accessor Function
#'
#' Pull the diffusion maps gene loadings matrix
#'
#' @param object Seurat object
#' @param dims.use Dimensions to include (default is all stored dims)
#' @param genes.use Genes to include (default is all)
#' @param use.full Return projected gene loadings (default is FALSE)
#'
#' @return Diffusion maps gene loading matrix for given genes and DMs
#'
#' @export
#'
DMLoad <- function(
  object,
  dims.use = NULL,
  genes.use = NULL,
  use.full = FALSE
) {
  # Thin wrapper around GetGeneLoadings with reduction.type fixed to "dm".
  return(GetGeneLoadings(
    object = object,
    reduction.type = "dm",
    dims.use = dims.use,
    genes.use = genes.use,
    use.full = use.full
  ))
}

################### Top Genes/Cells Related Functions ##########################

#' Find genes with highest scores for a given dimensional reduction technique
#'
#' Return a list of genes with the strongest contribution to a set of components
#'
#' @param object Seurat object
#' @param reduction.type Dimensional reduction to find the highest score for
#' @param dim.use Components to use
#' @param num.genes Number of genes to return
#' @param use.full Use the full PCA (projected PCA). Default is FALSE
#' @param do.balanced Return an equal number of genes with both + and - scores.
#'
#' @return Returns a vector of genes
#'
#' @export
#'
DimTopGenes <- function(
  object,
  dim.use = 1,
  reduction.type = "pca",
  num.genes = 30,
  use.full = FALSE,
  do.balanced = FALSE
) {
  #note that we use GetTopGenes, but it still works
  #error checking
  if (! reduction.type %in% names(x = object@dr)) {
    stop(paste(reduction.type, "dimensional reduction has not been computed"))
  }
  dim.scores <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "gene.loadings"
  )
  # Projected (full) loadings replace the regular loadings when requested.
  if (use.full) {
    dim.scores <- GetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "gene.loadings.full"
    )
  }
  if ((is.null(x = dim.scores)) || (ncol(x = dim.scores) < 2)) {
    stop(paste0(
      "Gene loadings for ",
      reduction.type,
      " with use.full=",
      use.full,
      " have not been computed"
    ))
  }
  i <- dim.use
  # Never request more genes than there are rows in the loadings matrix.
  num.genes <- min(num.genes, length(x = rownames(x = dim.scores)))
  key <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "key"
  )
  # dim.use may be a vector: pull top genes per dimension and deduplicate.
  dim.top.genes <- unique(x = unlist(x = lapply(
    X = i,
    FUN = GetTopGenes,
    dim.scores = dim.scores,
    do.balanced = do.balanced,
    num.genes = num.genes
  )))
  return(dim.top.genes)
}

#' Find genes with highest PCA scores
#'
#' Return a list of genes with the strongest contribution to a set of principal
#' components
#'
#' @param object Seurat object
#' @param pc.use Principal components to use
#' @param num.genes Number of genes to return
#' @param use.full Use the full PCA (projected PCA). Default is FALSE
#' @param do.balanced Return an equal number of genes with both + and - PC scores.
#'
#' @return Returns a vector of genes
#'
#' @export
#'
PCTopGenes <- function(
  object,
  pc.use = 1,
  num.genes = 30,
  use.full = FALSE,
  do.balanced = FALSE
) {
  # Thin wrapper around DimTopGenes with reduction.type fixed to "pca".
  return(DimTopGenes(
    object = object,
    dim.use = pc.use,
    reduction.type = "pca",
    num.genes = num.genes,
    use.full = use.full,
    do.balanced = do.balanced
  ))
}

#' Find genes with highest ICA scores
#'
#' Return a list of genes with the strongest contribution to a set of
#' indepdendent components
#'
#' @param object Seurat object
#' @param ic.use Independent components to use
#' @param num.genes Number of genes to return
#' @param use.full Use the projected gene loadings. Default is FALSE
#' @param do.balanced Return an equal number of genes with both + and - IC scores.
#'
#' @return Returns a vector of genes
#'
#' @export
#'
ICTopGenes <- function(
  object,
  ic.use = 1,
  num.genes = 30,
  use.full = FALSE,
  do.balanced = FALSE
) {
  # Thin wrapper around DimTopGenes with reduction.type fixed to "ica".
  return(DimTopGenes(
    object = object,
    dim.use = ic.use,
    reduction.type = "ica",
    use.full = use.full,
    num.genes = num.genes,
    do.balanced = do.balanced
  ))
}

#' Find cells with highest scores for a given dimensional reduction technique
#'
#' Return a list of genes with the strongest contribution to a set of components
#'
#' @param object Seurat object
#' @param reduction.type Dimensional reduction to find the highest score for
#' @param dim.use Components to use
#' @param num.cells Number of cells to return
#' @param do.balanced Return an equal number of cells with both + and - scores.
#'
#' @return Returns a vector of cells
#'
#' @export
#'
DimTopCells <- function(
  object,
  dim.use = 1,
  reduction.type = "pca",
  num.cells = NULL,
  do.balanced = FALSE
) {
  #note that we use GetTopGenes, but it still works
  #error checking
  if (! reduction.type %in% names(x = object@dr)) {
    stop(paste(reduction.type, "dimensional reduction has not been computed"))
  }
  num.cells <- SetIfNull(x = num.cells, default = length(x = object@cell.names))
  # Reuse GetTopGenes on the cell embeddings: rownames are cells here.
  dim.scores <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "cell.embeddings"
  )
  key <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "key"
  )
  i <- dim.use
  dim.top.cells <- unique(x = unlist(x = lapply(
    X = i,
    FUN = GetTopGenes,
    dim.scores = dim.scores,
    do.balanced = do.balanced,
    num.genes = num.cells
  )))
  return(dim.top.cells)
}

#' Find cells with highest PCA scores
#'
#' Return a list of genes with the strongest contribution to a set of principal components
#'
#' @param object Seurat object
#' @param pc.use Principal component to use
#' @param num.cells Number of cells to return
#' @param do.balanced Return an equal number of cells with both + and - PC scores.
#'
#' @return Returns a vector of cells
#'
#' @export
#'
PCTopCells <- function(
  object,
  pc.use = 1,
  num.cells = NULL,
  do.balanced = FALSE
) {
  # Thin wrapper around DimTopCells with reduction.type fixed to "pca".
  return(DimTopCells(
    object = object,
    dim.use = pc.use,
    reduction.type = "pca",
    num.cells = num.cells,
    do.balanced = do.balanced
  ))
}

#' Find cells with highest ICA scores
#'
#' Return a list of genes with the strongest contribution to a set of principal
#' components
#'
#' @param object Seurat object
#' @param ic.use Independent component to use
#' @param num.cells Number of cells to return
#' @param do.balanced Return an equal number of cells with both + and - IC scores.
#'
#' @return Returns a vector of cells
#'
#' @export
#'
ICTopCells <- function(
  object,
  ic.use = 1,
  num.cells = NULL,
  do.balanced = FALSE
) {
  # Thin wrapper around DimTopCells with reduction.type fixed to "ica".
  return(DimTopCells(
    object = object,
    dim.use = ic.use,
    reduction.type = "ica",
    num.cells = num.cells,
    do.balanced = do.balanced
  ))
}

##################### Printing results #########################################

#' Print the results of a dimensional reduction analysis
#'
#' Prints a set of genes that most strongly define a set of components
#'
#' @param object Seurat object
#' @param reduction.type Reduction technique to print results for
#' @param dims.print Number of dimensions to display
#' @param genes.print Number of genes to display
#' @param use.full Use full PCA (i.e. the projected PCA, by default FALSE)
#'
#' @return Set of genes defining the components
#'
#' @export
#'
PrintDim <- function(
  object,
  reduction.type = "pca",
  dims.print = 1:5,
  genes.print = 30,
  use.full = FALSE
) {
  if (use.full) {
    slot.use <- "gene.loadings.full"
  } else {
    slot.use <- "gene.loadings"
  }
  dim.scores <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = slot.use
  )
  dim.prefix <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "key"
  )
  # Only print dimensions that actually exist in the stored loadings.
  dim.codes.exist <- colnames(x = dim.scores)
  dim.codes.input <- paste0(dim.prefix, dims.print)
  dims.print <- dims.print[which(x = dim.codes.input %in% dim.codes.exist)]
  genes.print <- min(genes.print, nrow(x = dim.scores))
  if (length(x = dim.scores) == 0 && use.full) {
    warning("Dimensions have not been projected. Setting use.full = FALSE")
    use.full <- FALSE
  }
  # For each dimension, print the top genes from both tails of the loadings.
  for (i in dims.print) {
    code <- paste0(
      GetDimReduction(
        object = object,
        reduction.type = reduction.type,
        slot = "key"
      ),
      i
    )
    sx <- DimTopGenes(
      object = object,
      dim.use = i,
      reduction.type = reduction.type,
      num.genes = genes.print * 2,
      use.full = use.full,
      do.balanced = TRUE
    )
    print(code)
    print((sx[1:genes.print]))
    print ("")
    print(rev(x = (sx[(length(x = sx) - genes.print + 1):length(x = sx)])))
    print ("")
    print ("")
  }
}

#' Print the results of an ICA analysis
#'
#' Prints a set of genes that most strongly define a set of independent components
#'
#' @inheritParams VizPCA
#' @param ics.print Set of ICs to print genes for
#' @param genes.print Number of genes to print for each IC
#'
#' @return Only text output
#'
#' @export
#'
PrintICA <- function(
  object,
  ics.print = 1:5,
  genes.print = 30,
  use.full = FALSE
) {
  # Thin wrapper around PrintDim with reduction.type fixed to "ica".
  PrintDim(
    object = object,
    reduction.type = "ica",
    dims.print = ics.print,
    genes.print = genes.print,
    use.full = use.full
  )
}

#' Print the results of a PCA analysis
#'
#' Prints a set of genes that most strongly define a set of principal components
#'
#' @inheritParams VizPCA
#' @param pcs.print Set of PCs to print genes for
#' @param genes.print Number of genes to print for each PC
#'
#' @return Only text output
#'
#' @export
#'
PrintPCA <- function(
  object,
  pcs.print = 1:5,
  genes.print = 30,
  use.full = FALSE
) {
  # Thin wrapper around PrintDim with reduction.type fixed to "pca".
  PrintDim(
    object = object,
    reduction.type = "pca",
    dims.print = pcs.print,
    genes.print = genes.print,
    use.full = use.full
  )
}
diff --git a/R/interaction.R b/R/interaction.R
new file mode 100644
index 000000000..a28cf9f6f
--- /dev/null
+++ b/R/interaction.R
@@ -0,0 +1,831 @@
#' Merge Seurat Objects
#'
#' Merge two Seurat objects
#'
#' @param object1 First Seurat object to merge
#' @param object2 Second Seurat object to merge
#' @param min.cells Include genes with
#' Merge Seurat Objects
#'
#' Merge two Seurat objects
#'
#' @param object1 First Seurat object to merge
#' @param object2 Second Seurat object to merge
#' @param project Project name for the merged object (defaults to the project
#' name of object1)
#' @param min.cells Include genes with detected expression in at least this
#' many cells
#' @param min.genes Include cells where at least this many genes are detected
#' @param is.expr Expression threshold for 'detected' gene
#' @param do.normalize Normalize the data after merging. Default is TRUE.
#' If TRUE, will perform the same normalization strategy as stored for the
#' first object
#' @param scale.factor Scale factor to use when re-normalizing
#' @param do.scale In object@@scale.data, perform row-scaling (gene-based
#' z-score). FALSE by default, so run ScaleData after merging.
#' @param do.center In object@@scale.data, perform row-centering (gene-based
#' centering). FALSE by default
#' @param names.field For the initial identity class for each cell, choose this
#' field from the cell's column name
#' @param names.delim For the initial identity class for each cell, choose this
#' delimiter from the cell's column name
#' @param save.raw TRUE by default. If FALSE, do not save the unmodified data in
#' object@@raw.data, which will save memory downstream for large datasets
#' @param add.cell.id1 String to be appended to the names of all cells in object1
#' @param add.cell.id2 String to be appended to the names of all cells in object2
#'
#' @return Merged Seurat object
#'
#' @import Matrix
#' @importFrom dplyr full_join filter
#'
#' @export
#'
MergeSeurat <- function(
  object1,
  object2,
  project = NULL,
  min.cells = 0,
  min.genes = 0,
  is.expr = 0,
  do.normalize = TRUE,
  scale.factor = 1e4,
  do.scale = FALSE,
  do.center = FALSE,
  names.field = 1,
  names.delim = "_",
  save.raw = TRUE,
  add.cell.id1 = NULL,
  add.cell.id2 = NULL
) {
  # Merging operates on raw counts; bail out early if either object lacks them.
  if (length(x = object1@raw.data) < 2) {
    stop("First object provided has an empty raw.data slot. Adding/Merging performed on raw count data.")
  }
  if (length(x = object2@raw.data) < 2) {
    stop("Second object provided has an empty raw.data slot. Adding/Merging performed on raw count data.")
  }
  # Optionally suffix cell names so the two datasets stay distinguishable.
  if (!is.null(x = add.cell.id1)) {
    object1@cell.names <- paste(object1@cell.names, add.cell.id1, sep = ".")
    colnames(x = object1@raw.data) <- paste(
      colnames(x = object1@raw.data),
      add.cell.id1,
      sep = "."
    )
    rownames(x = object1@meta.data) <- paste(
      rownames(x = object1@meta.data),
      add.cell.id1,
      sep = "."
    )
  }
  if (!is.null(x = add.cell.id2)) {
    object2@cell.names <- paste(object2@cell.names, add.cell.id2, sep = ".")
    colnames(x = object2@raw.data) <- paste(
      colnames(x = object2@raw.data),
      add.cell.id2,
      sep = "."
    )
    rownames(x = object2@meta.data) <- paste(
      rownames(x = object2@meta.data),
      add.cell.id2,
      sep = "."
    )
  }
  if (any(object1@cell.names %in% object2@cell.names)) {
    warning("Duplicate cell names, enforcing uniqueness via make.unique()")
    # Rename only object2's cells; keep a map from old to uniquified names so
    # cell.names and meta.data rownames can be updated consistently.
    object2.names <- as.list(
      x = make.unique(
        names = c(
          colnames(x = object1@raw.data),
          colnames(x = object2@raw.data)
        )
      )[(ncol(x = object1@raw.data) + 1):(ncol(x = object1@raw.data) + ncol(x = object2@raw.data))]
    )
    names(x = object2.names) <- colnames(x = object2@raw.data)
    colnames(x = object2@raw.data) <- object2.names
    object2@cell.names <- unlist(
      x = unname(obj = object2.names[object2@cell.names])
    )
    rownames(x = object2@meta.data) <- unlist(
      x = unname(obj = object2.names[rownames(x = object2@meta.data)])
    )
  }
  merged.raw.data <- RowMergeSparseMatrices(
    mat1 = object1@raw.data[, object1@cell.names],
    mat2 = object2@raw.data[, object2@cell.names]
  )
  object1@meta.data <- object1@meta.data[object1@cell.names, ]
  object2@meta.data <- object2@meta.data[object2@cell.names, ]
  project <- SetIfNull(x = project, default = object1@project.name)
  # Track cell names in a column so the metadata frames can be joined and
  # later filtered against the surviving cells.
  object1@meta.data$cell.name <- rownames(x = object1@meta.data)
  object2@meta.data$cell.name <- rownames(x = object2@meta.data)
  merged.meta.data <- suppressMessages(
    suppressWarnings(
      full_join(x = object1@meta.data, y = object2@meta.data)
    )
  )
  merged.object <- CreateSeuratObject(
    raw.data = merged.raw.data,
    project = project,
    min.cells = min.cells,
    min.genes = min.genes,
    is.expr = is.expr,
    normalization.method = NULL,
    scale.factor = scale.factor,
    do.scale = FALSE,
    do.center = FALSE,
    names.field = names.field,
    names.delim = names.delim,
    save.raw = save.raw
  )
  if (do.normalize) {
    # Re-normalize with the same strategy recorded on the first object.
    merged.object <- NormalizeData(
      object = merged.object,
      assay.type = "RNA",
      scale.factor = GetCalcParam(
        object = object1,
        calculation = "NormalizeData",
        parameter = "scale.factor"
      ),
      normalization.method = GetCalcParam(
        object = object1,
        calculation = "NormalizeData",
        parameter = "normalization.method"
      )
    )
  }
  if (do.scale || do.center) {
    merged.object <- ScaleData(
      object = merged.object,
      do.scale = do.scale,
      do.center = do.center
    )
  }
  # Keep metadata only for cells that survived filtering in CreateSeuratObject.
  merged.meta.data <- filter(
    merged.meta.data,
    cell.name %in% merged.object@cell.names
  )
  rownames(x = merged.meta.data) <- merged.object@cell.names
  merged.meta.data$cell.name <- NULL
  merged.object@meta.data <- merged.meta.data
  return(merged.object)
}
#' Add samples into existing Seurat object.
#'
#' @param object Seurat object
#' @param new.data Data matrix for samples to be added
#' @param project Project name (string)
#' @param min.cells Include genes with detected expression in at least this
#' many cells
#' @param min.genes Include cells where at least this many genes are detected
#' @param is.expr Expression threshold for 'detected' gene
#' @param normalization.method Normalization method to apply after merging.
#' Default is NULL (no normalization)
#' @param scale.factor scale factor in the log normalization
#' @param do.scale In object@@scale.data, perform row-scaling (gene-based z-score)
#' @param do.center In object@@scale.data, perform row-centering (gene-based
#' centering)
#' @param names.field For the initial identity class for each cell, choose this
#' field from the cell's column name
#' @param names.delim For the initial identity class for each cell, choose this
#' delimiter from the cell's column name
#' @param meta.data Additional metadata to add to the Seurat object. Should be
#' a data frame where the rows are cell names, and the columns are additional
#' metadata fields
#' @param save.raw TRUE by default. If FALSE, do not save the unmodified data in
#' object@@raw.data, which will save memory downstream for large datasets
#' @param add.cell.id String to be appended to the names of all cells in
#' new.data. E.g. if add.cell.id = "rep1", "cell1" becomes "cell1.rep1"
#'
#' @return Seurat object with the new samples merged in
#'
#' @import Matrix
#' @importFrom dplyr full_join
#'
#' @export
#'
AddSamples <- function(
  object,
  new.data,
  project = NULL,
  min.cells = 3,
  min.genes = 1000,
  is.expr = 0,
  normalization.method = NULL,
  scale.factor = 1e4,
  do.scale = TRUE,
  do.center = TRUE,
  names.field = 1,
  names.delim = "_",
  meta.data = NULL,
  save.raw = TRUE,
  add.cell.id = NULL
) {
  # Merging operates on raw counts; bail out early if they are absent.
  if (length(x = object@raw.data) < 2) {
    stop("Object provided has an empty raw.data slot. Adding/Merging performed on raw count data.")
  }
  if (!is.null(x = add.cell.id)) {
    colnames(x = new.data) <- paste(colnames(x = new.data), add.cell.id, sep = ".")
  }
  if (any(colnames(x = new.data) %in% object@cell.names)) {
    warning("Duplicate cell names, enforcing uniqueness via make.unique()")
    # Rename only the incoming cells; keep a map from old to uniquified names
    # so any supplied metadata rownames can be remapped too.
    new.data.names <- as.list(
      x = make.unique(
        names = c(
          colnames(x = object@raw.data),
          colnames(x = new.data)
        )
      )[(ncol(x = object@raw.data) + 1):(ncol(x = object@raw.data) + ncol(x = new.data))]
    )
    names(x = new.data.names) <- colnames(x = new.data)
    colnames(x = new.data) <- new.data.names
    if (!is.null(x = meta.data)) {
      rownames(x = meta.data) <- unlist(
        x = unname(obj = new.data.names[rownames(x = meta.data)])
      )
    }
  }
  combined.data <- RowMergeSparseMatrices(
    mat1 = object@raw.data[, object@cell.names],
    mat2 = new.data
  )
  if (is.null(x = meta.data)) {
    # No metadata for the new cells: pad with NAs so rbind lines up.
    filler <- matrix(
      data = NA,
      nrow = ncol(x = new.data),
      ncol = ncol(x = object@meta.data)
    )
    rownames(x = filler) <- colnames(x = new.data)
    colnames(x = filler) <- colnames(x = object@meta.data)
    filler <- as.data.frame(x = filler)
    combined.meta.data <- rbind(object@meta.data, filler)
  } else {
    combined.meta.data <- suppressMessages(
      suppressWarnings(
        full_join(x = object@meta.data, y = meta.data)
      )
    )
  }
  project <- SetIfNull(x = project, default = object@project.name)
  new.object <- CreateSeuratObject(
    raw.data = combined.data,
    project = project,
    min.cells = min.cells,
    min.genes = min.genes,
    is.expr = is.expr,
    normalization.method = normalization.method,
    scale.factor = scale.factor,
    do.scale = do.scale,
    do.center = do.center,
    names.field = names.field,
    names.delim = names.delim,
    save.raw = save.raw
  )
  # Keep metadata only for cells that survived filtering.
  new.object@meta.data <- combined.meta.data[new.object@cell.names, ]
  return(new.object)
}
#' Return a subset of the Seurat object
#'
#' Creates a Seurat object containing only a subset of the cells in the
#' original object. Takes either a list of cells to use as a subset, or a
#' parameter (for example, a gene), to subset on.
#'
#' @param object Seurat object
#' @param cells.use A vector of cell names to use as a subset. If NULL
#' (default), then this list will be computed based on the next three
#' arguments. Otherwise, will return an object consisting only of these cells
#' @param subset.name Parameter to subset on. Eg, the name of a gene, PC1, a
#' column name in object@@meta.data, etc. Any argument that can be retrieved
#' using FetchData
#' @param ident.use Create a cell subset based on the provided identity classes
#' @param ident.remove Subtract out cells from these identity classes (used for
#' filtration)
#' @param accept.low Low cutoff for the parameter (default is -Inf)
#' @param accept.high High cutoff for the parameter (default is Inf)
#' @param do.center Recenter the new object@@scale.data
#' @param do.scale Rescale the new object@@scale.data. FALSE by default
#' @param max.cells.per.ident Can be used to downsample the data to a certain
#' max per cell ident. Default is inf.
#' @param random.seed Random seed for downsampling
#' @param \dots Additional arguments to be passed to FetchData (for example,
#' use.imputed=TRUE)
#'
#' @return Returns a Seurat object containing only the relevant subset of cells
#'
#' @export
#'
SubsetData <- function(
  object,
  cells.use = NULL,
  subset.name = NULL,
  ident.use = NULL,
  ident.remove = NULL,
  accept.low = -Inf,
  accept.high = Inf,
  do.center = FALSE,
  do.scale = FALSE,
  max.cells.per.ident = Inf,
  random.seed = 1,
  ...
) {
  data.use <- NULL
  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
  # Resolve the cell list from identities when given; ident.remove alone means
  # "every identity except these".
  if (!is.null(x = ident.use)) {
    ident.use <- setdiff(x = ident.use, y = ident.remove)
    cells.use <- WhichCells(object = object, ident = ident.use)
  }
  if (is.null(x = ident.use) && !is.null(x = ident.remove)) {
    ident.use <- setdiff(x = unique(x = object@ident), y = ident.remove)
    cells.use <- WhichCells(object = object, ident = ident.use)
  }
  # Subsetting on a fetched value (gene, PC score, metadata column, ...).
  if (!is.null(x = subset.name)) {
    data.use <- FetchData(object, subset.name, ...)
    if (length(x = data.use) == 0) {
      return(object)
    }
    subset.data <- data.use[, subset.name]
    pass.inds <- which(x = (subset.data > accept.low) & (subset.data < accept.high))
    cells.use <- rownames(x = data.use)[pass.inds]
  }
  cells.use <- intersect(x = cells.use, y = object@cell.names)
  # Apply per-identity downsampling if requested.
  cells.use <- WhichCells(
    object = object,
    cells.use = cells.use,
    max.cells.per.ident = max.cells.per.ident,
    random.seed = random.seed
  )
  object@data <- object@data[, cells.use]
  if (!is.null(x = object@scale.data)) {
    # NOTE: fixed parenthesization — the original computed
    # length(colnames(...) > 0), the length of a logical vector.
    if (length(x = colnames(x = object@scale.data)) > 0) {
      # Keep only complete rows and the retained cells.
      object@scale.data <- object@scale.data[
        complete.cases(object@scale.data), # rows
        cells.use                          # columns
      ]
    }
  }
  if (do.scale) {
    object <- ScaleData(
      object = object,
      do.scale = do.scale,
      do.center = do.center
    )
    object@scale.data <- object@scale.data[
      complete.cases(object@scale.data), # rows
      cells.use                          # columns
    ]
  }
  object@ident <- drop.levels(x = object@ident[cells.use])
  # Subset the cell embeddings of every stored dimensional reduction.
  if (length(x = object@dr) > 0) {
    for (i in 1:length(x = object@dr)) {
      if (length(x = object@dr[[i]]@cell.embeddings) > 0) {
        object@dr[[i]]@cell.embeddings <- object@dr[[i]]@cell.embeddings[cells.use, , drop = FALSE]
      }
    }
  }
  # Handle multimodal cases: subset each additional assay's matrices.
  if (!.hasSlot(object = object, name = "assay")) {
    object@assay <- list()
  }
  if (length(x = object@assay) > 0) {
    for (i in 1:length(x = object@assay)) {
      if ((!is.null(x = object@assay[[i]]@raw.data)) && (ncol(x = object@assay[[i]]@raw.data) > 1)) {
        object@assay[[i]]@raw.data <- object@assay[[i]]@raw.data[, cells.use]
      }
      if ((!is.null(x = object@assay[[i]]@data)) && (ncol(x = object@assay[[i]]@data) > 1)) {
        object@assay[[i]]@data <- object@assay[[i]]@data[, cells.use]
      }
      if ((!is.null(x = object@assay[[i]]@scale.data)) && (ncol(x = object@assay[[i]]@scale.data) > 1)) {
        object@assay[[i]]@scale.data <- object@assay[[i]]@scale.data[, cells.use]
      }
    }
  }
  object@cell.names <- cells.use
  object@meta.data <- data.frame(object@meta.data[cells.use, ])
  return(object)
}

#' Reorder identity classes
#'
#' Re-assigns the identity classes according to the average expression of a
#' particular feature (i.e, gene expression, or PC score).
#' Very useful after clustering, to re-order cells, for example, based on PC
#' scores.
#'
#' @param object Seurat object
#' @param feature Feature to reorder on. Default is PC1
#' @param rev Reverse ordering (default is FALSE)
#' @param aggregate.fxn Function to evaluate each identity class based on
#' (default is mean)
#' @param reorder.numeric Rename all identity classes to be increasing numbers
#' starting from 1 (default is FALSE)
#' @param \dots additional arguments (i.e. use.imputed=TRUE)
#'
#' @return A seurat object where the identities have been re-ordered based on
#' the average.
#'
#' @export
#'
ReorderIdent <- function(
  object,
  feature = "PC1",
  rev = FALSE,
  aggregate.fxn = mean,
  reorder.numeric = FALSE,
  ...
) {
  ident.use <- object@ident
  data.use <- FetchData(object = object, vars.all = feature, ...)[, 1]
  # Same is the identity function; swap in a rank-reversing transform when
  # a reversed ordering is requested.
  revFxn <- Same
  if (rev) {
    revFxn <- function(x) {
      return(max(x) + 1 - x)
    }
  }
  # Order identity classes by the aggregated feature value.
  names.sort <- names(
    x = revFxn(
      sort(
        x = tapply(
          X = data.use,
          INDEX = (ident.use),
          FUN = aggregate.fxn
        )
      )
    )
  )
  ident.new <- factor(x = ident.use, levels = names.sort, ordered = TRUE)
  if (reorder.numeric) {
    # Relabel classes as 1..k in (possibly reversed) rank order of their means.
    ident.new <- factor(
      x = revFxn(
        rank(
          tapply(
            X = data.use,
            INDEX = as.numeric(x = ident.new),
            FUN = mean
          )
        )
      )[as.numeric(ident.new)],
      levels = 1:length(x = levels(x = ident.new)),
      ordered = TRUE
    )
  }
  names(x = ident.new) <- names(x = ident.use)
  object@ident <- ident.new
  return(object)
}
#' Access cellular data
#'
#' Retrieves data (gene expression, PCA scores, metrics, etc.) for a set
#' of cells in a Seurat object
#'
#' @param object Seurat object
#' @param vars.all List of all variables to fetch
#' @param cells.use Cells to collect data for (default is all cells)
#' @param use.imputed For gene expression, use imputed values. Default is FALSE
#' @param use.scaled For gene expression, use scaled values. Default is FALSE
#' @param use.raw For gene expression, use raw values. Default is FALSE
#'
#' @return A data frame with cells as rows and cellular data as columns
#'
#' @export
#'
FetchData <- function(
  object,
  vars.all = NULL,
  cells.use = NULL,
  use.imputed = FALSE,
  use.scaled = FALSE,
  use.raw = FALSE
) {
  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
  data.return <- data.frame(row.names = cells.use)
  data.expression <- as.matrix(x = data.frame(row.names = cells.use))
  # If any vars passed are genes, subset expression data.
  gene.check <- vars.all %in% rownames(x = object@data)
  if (all(gene.check)) {
    # Exactly one source is consulted (precedence: imputed > scaled > raw >
    # normalized). BUGFIX: the original used independent if statements, so the
    # trailing `else` overwrote imputed/scaled values with object@data whenever
    # use.raw was FALSE.
    if (use.imputed) {
      data.expression <- object@imputed[vars.all, cells.use, drop = FALSE]
    } else if (use.scaled) {
      data.expression <- object@scale.data[vars.all, cells.use, drop = FALSE]
    } else if (use.raw) {
      data.expression <- object@raw.data[vars.all, cells.use, drop = FALSE]
    } else {
      data.expression <- object@data[vars.all, cells.use, drop = FALSE]
    }
    return(t(x = as.matrix(x = data.expression)))
  } else if (any(gene.check)) {
    # Mixed request: pull the gene subset now, resolve the rest below.
    if (use.imputed) {
      data.expression <- object@imputed[vars.all[gene.check], cells.use, drop = FALSE]
    } else if (use.scaled) {
      data.expression <- object@scale.data[vars.all[gene.check], cells.use, drop = FALSE]
    } else if (use.raw) {
      data.expression <- object@raw.data[vars.all[gene.check], cells.use, drop = FALSE]
    } else {
      data.expression <- object@data[vars.all[gene.check], cells.use, drop = FALSE]
    }
    data.expression <- t(x = data.expression)
  }
  # Now check for multimodal data.
  if (length(x = object@assay) > 0) {
    data.types <- names(x = object@assay)
    slot.use <- "data"
    if (use.scaled) {
      slot.use <- "scale.data"
    }
    if (use.raw) {
      slot.use <- "raw.data"
    }
    for (data.type in data.types) {
      all_data <- GetAssayData(
        object = object,
        assay.type = data.type,
        slot = slot.use
      )
      genes.include <- intersect(x = vars.all, y = rownames(x = all_data))
      data.expression <- cbind(
        data.expression,
        t(x = all_data[genes.include, , drop = FALSE])
      )
    }
  }
  var.options <- c("meta.data", "mix.probs", "gene.scores")
  if (length(x = names(x = object@dr)) > 0) {
    dr.options <- names(x = object@dr)
    # Key prefix (e.g. "PC", "IC") for each stored dimensional reduction;
    # direct slot access replaces the original eval(parse(...)) construction.
    dr.names <- sapply(
      X = dr.options,
      FUN = function(dr) {
        return(object@dr[[dr]]@key)
      }
    )
    names(x = dr.names) <- dr.options
    var.options <- c(var.options, dr.names)
  }
  object@meta.data[, "ident"] <- object@ident[rownames(x = object@meta.data)]
  for (my.var in vars.all) {
    data.use <- data.frame()
    if (my.var %in% colnames(x = data.expression)) {
      data.use <- data.expression
    } else {
      # A var like "PC3" matches a reduction key ("PC") once digits are
      # stripped; fetch that reduction's cell embeddings.
      for (i in var.options) {
        if (all(unlist(x = strsplit(x = my.var, split = "[0-9]+")) == i)) {
          dr.use <- names(x = var.options[which(x = i == var.options)])
          data.use <- object@dr[[dr.use]]@cell.embeddings
          colnames(x = data.use) <- paste0(i, 1:ncol(x = data.use))
          break
        }
      }
    }
    # Metadata columns take precedence over any earlier match.
    if (my.var %in% colnames(x = object@meta.data)) {
      data.use <- object@meta.data[, my.var, drop = FALSE]
    }
    if (ncol(x = data.use) == 0) {
      stop(paste("Error:", my.var, "not found"))
    }
    cells.use <- intersect(x = cells.use, y = rownames(x = data.use))
    if (!my.var %in% colnames(x = data.use)) {
      stop(paste("Error:", my.var, "not found"))
    }
    data.add <- data.use[cells.use, my.var]
    if (is.null(x = data.add)) {
      stop(paste("Error:", my.var, "not found"))
    }
    data.return <- cbind(data.return, data.add)
  }
  colnames(x = data.return) <- vars.all
  rownames(x = data.return) <- cells.use
  return(data.return)
}
#' FastWhichCells
#' Identify cells matching certain criteria (limited to character values)
#'
#' @param object Seurat object
#' @param group.by Group cells in different ways (for example, orig.ident).
#' Should be a column name in object@@meta.data
#' @param subset.value Return cells matching this value
#' @param invert Invert the selection (return non-matching cells). FALSE by
#' default
#'
#' @return A vector of cell names
#'
#' @export
#'
FastWhichCells <- function(object, group.by, subset.value, invert = FALSE) {
  # Temporarily switch identities to the metadata column, then select by value.
  object <- SetAllIdent(object = object, id = group.by)
  cells.return <- WhichCells(object = object, ident = subset.value)
  if (invert) {
    cells.return <- setdiff(x = object@cell.names, y = cells.return)
  }
  return(cells.return)
}

#' Identify cells matching certain criteria
#'
#' Returns a list of cells that match a particular set of criteria such as
#' identity class, high/low values for particular PCs, etc.
#'
#' @param object Seurat object
#' @param ident Identity classes to subset. Default is all identities.
#' @param ident.remove Identity classes to remove. Default is NULL.
#' @param cells.use Subset of cell names
#' @param subset.name Parameter to subset on. Eg, the name of a gene, PC1, a
#' column name in object@@meta.data, etc. Any argument that can be retrieved
#' using FetchData
#' @param accept.low Low cutoff for the parameter (default is -Inf)
#' @param accept.high High cutoff for the parameter (default is Inf)
#' @param accept.value Returns all cells with the subset name equal to this value
#' @param max.cells.per.ident Can be used to downsample the data to a certain
#' max per cell ident. Default is inf.
#' @param random.seed Random seed for downsampling
#'
#' @return A vector of cell names
#'
#' @export
#'
WhichCells <- function(
  object,
  ident = NULL,
  ident.remove = NULL,
  cells.use = NULL,
  subset.name = NULL,
  accept.low = -Inf,
  accept.high = Inf,
  accept.value = NULL,
  max.cells.per.ident = Inf,
  random.seed = 1
) {
  set.seed(seed = random.seed)
  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
  ident <- SetIfNull(x = ident, default = unique(x = object@ident))
  ident <- setdiff(x = ident, y = ident.remove)
  if (!all(ident %in% unique(x = object@ident))) {
    bad.idents <- ident[!(ident %in% unique(x = object@ident))]
    stop(paste("Identity :", bad.idents, "not found. "))
  }
  cells.to.use <- character()
  for (id in ident) {
    cells.in.ident <- object@ident[cells.use]
    cells.in.ident <- names(x = cells.in.ident[cells.in.ident == id])
    cells.in.ident <- cells.in.ident[!is.na(x = cells.in.ident)]
    # Downsample each identity class independently when a cap is set.
    if (length(x = cells.in.ident) > max.cells.per.ident) {
      cells.in.ident <- sample(x = cells.in.ident, size = max.cells.per.ident)
    }
    cells.to.use <- c(cells.to.use, cells.in.ident)
  }
  cells.use <- cells.to.use
  if (!is.null(x = subset.name)) {
    subset.name <- as.character(x = subset.name)
    data.use <- FetchData(
      object = object,
      vars.all = subset.name,
      cells.use = cells.use
    )
    if (length(x = data.use) == 0) {
      # BUGFIX: the original message referenced the stale loop variable `id`
      # instead of the parameter that actually failed to resolve.
      stop(paste("Error : ", subset.name, " not found"))
    }
    subset.data <- data.use[, subset.name, drop = FALSE]
    if (!is.null(x = accept.value)) {
      pass.inds <- which(x = subset.data == accept.value)
    } else {
      pass.inds <- which(x = (subset.data >= accept.low) & (subset.data <= accept.high))
    }
    cells.use <- rownames(x = data.use)[pass.inds]
  }
  return(cells.use)
}
#' Switch identity class definition to another variable
#'
#' @param object Seurat object
#' @param id Variable to switch identity class to (for example, 'DBclust.ident',
#' the output of density clustering). Default is orig.ident - the original
#' annotation pulled from the cell name.
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
SetAllIdent <- function(object, id = NULL) {
  id <- SetIfNull(x = id, default = "orig.ident")
  if (id %in% colnames(x = object@meta.data)) {
    cells.use <- rownames(x = object@meta.data)
    ident.use <- object@meta.data[, id]
    object <- SetIdent(
      object = object,
      cells.use = cells.use,
      ident.use = ident.use
    )
  } else {
    # Previously this silently returned the object unchanged; warn so callers
    # notice a mistyped column name.
    warning(paste(id, "not found in object metadata; identities unchanged"))
  }
  return(object)
}

#' Rename one identity class to another
#'
#' Can also be used to join identity classes together (for example, to merge
#' clusters).
#'
#' @param object Seurat object
#' @param old.ident.name The old identity class (to be renamed)
#' @param new.ident.name The new name to apply
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
RenameIdent <- function(object, old.ident.name = NULL, new.ident.name = NULL) {
  # Fail with a clear message instead of the cryptic "argument is of length
  # zero" the NULL defaults used to trigger.
  if (is.null(x = old.ident.name) || is.null(x = new.ident.name)) {
    stop("Please provide both old.ident.name and new.ident.name")
  }
  if (!old.ident.name %in% object@ident) {
    stop(paste("Error : ", old.ident.name, " is not a current identity class"))
  }
  new.levels <- old.levels <- levels(x = object@ident)
  if (new.ident.name %in% old.levels) {
    # Merging into an existing class: drop the old level entirely.
    new.levels <- new.levels[new.levels != old.ident.name]
  } else {
    # Plain rename: swap the level label in place.
    new.levels[new.levels == old.ident.name] <- new.ident.name
  }
  ident.vector <- as.character(x = object@ident)
  names(x = ident.vector) <- names(x = object@ident)
  ident.vector[WhichCells(object = object, ident = old.ident.name)] <- new.ident.name
  object@ident <- factor(x = ident.vector, levels = new.levels)
  return(object)
}
#' Set identity class information
#'
#' Stashes the identity in meta.data to be retrieved later. Useful if, for
#' example, testing multiple clustering parameters
#'
#' @param object Seurat object
#' @param save.name Store current object@@ident under this column name in
#' object@@meta.data. Can be easily retrieved with SetAllIdent
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
StashIdent <- function(object, save.name = "oldIdent") {
  object@meta.data[, save.name] <- as.character(x = object@ident)
  return(object)
}

#' Set identity class information
#'
#' Sets the identity class value for a subset (or all) cells
#'
#' @param object Seurat object
#' @param cells.use Vector of cells to set identity class info for (default is
#' all cells)
#' @param ident.use Vector of identity class values to assign (character
#' vector)
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @importFrom gdata drop.levels
#'
#' @export
#'
SetIdent <- function(object, cells.use = NULL, ident.use = NULL) {
  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
  # BUGFIX: the original wrote length(setdiff(...) > 0), taking the length of
  # a logical vector; the comparison belongs outside the length() call.
  unknown.cells <- setdiff(x = cells.use, y = object@cell.names)
  if (length(x = unknown.cells) > 0) {
    stop(paste("ERROR : Cannot find cells ", unknown.cells))
  }
  # Extend the factor's levels with any identities not seen before, then
  # assign and drop levels that are no longer used.
  ident.new <- setdiff(x = ident.use, y = levels(x = object@ident))
  object@ident <- factor(
    x = object@ident,
    levels = unique(
      x = c(
        as.character(x = object@ident),
        as.character(x = ident.new)
      )
    )
  )
  object@ident[cells.use] <- ident.use
  object@ident <- drop.levels(x = object@ident)
  return(object)
}
#' Add Metadata
#'
#' Adds additional data for single cells to the Seurat object. Can be any piece
#' of information associated with a cell (examples include read depth,
#' alignment rate, experimental batch, or subpopulation identity). The
#' advantage of adding it to the Seurat object is so that it can be
#' analyzed/visualized using FetchData, VlnPlot, GenePlot, SubsetData, etc.
#'
#' @param object Seurat object
#' @param metadata Data frame where the row names are cell names (note : these
#' must correspond exactly to the items in object@@cell.names), and the columns
#' are additional metadata items. Alternatively, a single vector (in which case
#' col.name must be supplied).
#' @param col.name Name for metadata if passing in single vector of information
#'
#' @return Seurat object where the additional metadata has been added as
#' columns in object@@meta.data
#'
#' @export
#'
AddMetaData <- function(object, metadata, col.name = NULL) {
  # A bare vector (anything that isn't list-typed, i.e. not a data frame)
  # must be boxed into a data frame and needs an explicit column name.
  if (typeof(x = metadata) != "list") {
    if (is.null(x = col.name)) {
      stop("Please provide a name for provided metadata")
    }
    metadata <- as.data.frame(x = metadata)
    colnames(x = metadata) <- col.name
  }
  # Align the incoming rows to the existing metadata rows by cell name and
  # append every supplied column.
  new.cols <- colnames(x = metadata)
  object@meta.data[, new.cols] <- metadata[rownames(x = object@meta.data), new.cols]
  return(object)
}
+#' +#' @importFrom pbapply pbsapply +#' +#' @references Inspired by Chung et al, Bioinformatics (2014) +#' +#' @export +#' +JackStraw <- function( + object, + num.pc = 20, + num.replicate = 100, + prop.freq = 0.01, + do.print = FALSE +) { + if (is.null(object@dr$pca)) { + stop("PCA has not been computed yet. Please run RunPCA().") + } + # error checking for number of PCs + if (num.pc > ncol(x = GetDimReduction(object,"pca","cell.embeddings"))) { + num.pc <- ncol(x = GetDimReduction(object,"pca","cell.embeddings")) + warning("Number of PCs specified is greater than PCs available. Setting num.pc to ", num.pc, " and continuing.") + } + if (num.pc > length(x = object@cell.names)) { + num.pc <- length(x = object@cell.names) + warning("Number of PCs specified is greater than number of cells. Setting num.pc to ", num.pc, " and continuing.") + } + pc.genes <- rownames(x = GetDimReduction(object,"pca","gene.loadings")) + if (length(x = pc.genes) < 3) { + stop("Too few variable genes") + } + if (length(x = pc.genes) * prop.freq < 3) { + warning( + "Number of variable genes given ", + prop.freq, + " as the prop.freq is low. Consider including more variable genes and/or increasing prop.freq. ", + "Continuing with 3 genes in every random sampling." 
+    )
+  }
+  md.x <- as.matrix(x = GetDimReduction(object, "pca", "gene.loadings"))
+  md.rot <- as.matrix(x = GetDimReduction(object, "pca", "cell.embeddings"))
+  # Use the progress-bar variant of sapply only when printing was requested
+  if (do.print) {
+    applyFunction <- pbsapply
+  } else {
+    applyFunction <- sapply
+  }
+  # Re-use the parameters of the original PCA run for the permuted runs
+  rev.pca <- GetCalcParam(object = object,
+                          calculation = "PCA",
+                          parameter = "rev.pca")
+  weight.by.var <- GetCalcParam(object = object,
+                                calculation = "PCA",
+                                parameter = "weight.by.var")
+  data.use.scaled <- GetAssayData(object = object,
+                                  assay.type = "RNA",
+                                  slot = "scale.data")[pc.genes, ]
+  # fixed: name the arguments exactly ('prop.use', 'weight.by.var') instead
+  # of relying on partial matching and a bare positional after named args
+  fake.pcVals.raw <- applyFunction(
+    X = 1:num.replicate,
+    FUN = function(x)
+      return(JackRandom(
+        scaled.data = data.use.scaled,
+        prop.use = prop.freq,
+        r1.use = 1,
+        r2.use = num.pc,
+        seed.use = x,
+        rev.pca = rev.pca,
+        weight.by.var = weight.by.var
+      )),
+    simplify = FALSE
+  )
+  # Pool the permuted loadings across replicates, one column per PC
+  fake.pcVals <- sapply(
+    X = 1:num.pc,
+    FUN = function(x) {
+      return(as.numeric(x = unlist(x = lapply(
+        X = 1:num.replicate,
+        FUN = function(y) {
+          return(fake.pcVals.raw[[y]][, x])
+        }
+      ))))
+    }
+  )
+  jackStraw.fakePC <- as.matrix(fake.pcVals)
+  # Empirical p-value per gene and PC: fraction of null (permuted) loadings
+  # whose magnitude exceeds the observed loading magnitude
+  jackStraw.empP <- as.matrix(
+    sapply(
+      X = 1:num.pc,
+      FUN = function(x) {
+        return(unlist(x = lapply(
+          X = abs(md.x[, x]),
+          FUN = EmpiricalP,
+          nullval = abs(fake.pcVals[, x])
+        )))
+      }
+    )
+  )
+  colnames(x = jackStraw.empP) <- paste0("PC", 1:ncol(x = jackStraw.empP))
+
+  jackstraw.obj <- new(
+    Class = "jackstraw.data",
+    emperical.p.value = jackStraw.empP,
+    fake.pc.scores = fake.pcVals,
+    emperical.p.value.full = matrix()
+  )
+  object <- SetDimReduction(object = object,
+                            reduction.type = "pca",
+                            slot = "jackstraw",
+                            new.data = jackstraw.obj)
+
+  return(object)
+}
+
+#' Compute PCA loadings for a permuted subset of genes
+#'
+#' Permutes a random subset of rows of the scaled data, re-runs PCA on the
+#' modified matrix, and returns the loadings of the permuted genes for the
+#' requested PCs. Used internally by JackStraw.
+#'
+#' @param scaled.data Matrix of scaled expression data (genes as rows)
+#' @param prop.use Proportion of genes to permute (a minimum of 3 genes is
+#' always used)
+#' @param r1.use First PC to return
+#' @param r2.use Last PC to return (also the number of PCs computed)
+#' @param seed.use Random seed, set before sampling the genes
+#' @param rev.pca Passed to RunPCA
+#' @param weight.by.var Passed to RunPCA. Note: the previous default was the
+#' self-referential 'weight.by.var = weight.by.var', which errored whenever
+#' the argument was omitted; TRUE is now the usable default.
+#'
+#' @return Matrix of loadings for the permuted genes over PCs r1.use:r2.use
+#'
+#' @export
+#'
+JackRandom <- function(
+  scaled.data,
+  prop.use = 0.01,
+  r1.use = 1,
+  r2.use = 5,
+  seed.use = 1,
+  rev.pca = FALSE,
+  weight.by.var = TRUE
+) {
+  set.seed(seed = seed.use)
+  rand.genes <- sample(
+    x = rownames(x = scaled.data),
+    size = nrow(x = scaled.data) * prop.use
+  )
+  # make sure 
that rand.genes is at least 3 + if (length(x = rand.genes) < 3){ + rand.genes <- sample(x = rownames(x = scaled.data), size = 3) + } + data.mod <- scaled.data + data.mod[rand.genes, ] <- MatrixRowShuffle(x = scaled.data[rand.genes, ]) + temp.object <- new("seurat") + temp.object@scale.data <- data.mod + temp.object <- RunPCA(temp.object, pcs.compute = r2.use, pc.genes = rownames(data.mod), + rev.pca = rev.pca, weight.by.var = weight.by.var, + do.print = F) + fake.x <- PCALoad(temp.object) + fake.rot <- PCAEmbed(temp.object) + return(fake.x[rand.genes, r1.use:r2.use]) +} + + + +#' Significant genes from a PCA +#' +#' Returns a set of genes, based on the JackStraw analysis, that have +#' statistically significant associations with a set of PCs. +#' +#' @param object Seurat object +#' @param pcs.use PCS to use. +#' @param pval.cut P-value cutoff +#' @param use.full Use the full list of genes (from the projected PCA). Assumes +#' that ProjectPCA has been run. Currently, must be set to FALSE. +#' @param max.per.pc Maximum number of genes to return per PC. Used to avoid genes from one PC dominating the entire analysis. +#' +#' @return A vector of genes whose p-values are statistically significant for +#' at least one of the given PCs. 
+#' +#' @export +#' +PCASigGenes <- function( + object, + pcs.use, + pval.cut = 0.1, + use.full = FALSE, + max.per.pc = NULL +) { + pvals.use <- GetDimReduction(object,reduction.type = "pca",slot = "jackstraw")@emperical.p.value + pcx.use <- GetDimReduction(object,reduction.type = "pca",slot = "gene.loadings") + if (use.full) { + pvals.use <- GetDimReduction(object,reduction.type = "pca",slot = "jackstraw")@emperical.p.value.full + pcx.use <- GetDimReduction(object,reduction.type = "pca",slot = "gene.loadings.full") + } + if (length(x = pcs.use) == 1) { + pvals.min <- pvals.use[, pcs.use] + } + if (length(x = pcs.use) > 1) { + pvals.min <- apply(X = pvals.use[, pcs.use], MARGIN = 1, FUN = min) + } + names(x = pvals.min) <- rownames(x = pvals.use) + genes.use <- names(x = pvals.min)[pvals.min < pval.cut] + if (! is.null(x = max.per.pc)) { + pc.top.genes <- PCTopGenes( + object = object, + pc.use = pcs.use, + num.genes = max.per.pc, + use.full = use.full, + do.balanced = FALSE + ) + genes.use <- ainb(a = pc.top.genes, b = genes.use) + } + return(genes.use) +} diff --git a/R/jackstraw_internal.R b/R/jackstraw_internal.R new file mode 100644 index 000000000..de8cb5923 --- /dev/null +++ b/R/jackstraw_internal.R @@ -0,0 +1,51 @@ +#define class to store jackstraw data +jackstraw.data <- setClass( + Class = "jackstraw.data", + slots = list( + emperical.p.value = "matrix", + fake.pc.scores = "matrix", + emperical.p.value.full = "matrix" + ) +) + + +#internal +JackstrawF <- function(prop = 0.1, myR1, myR2 = 3, data = smD) { + randGenes <- sample(x = rownames(x = data), size = nrow(x = data) * prop) + smD.mod <- data + smD.mod[randGenes, ] <- MatrixRowShuffle(x = data[randGenes, ]) + fmd.pca <- prcomp(x = smD.mod) + fmd.x <- fmd.pca$x + fmd.rot <- fmd.pca$rotation + fakeF <- unlist(x = lapply( + X = randGenes, + FUN = JackF, + r1 = myR1, + r2 = myR2, + x = fmd.x, + rot = fmd.rot + )) +} + +#internal +JackF <- function(gene, r1 = 1,r2 = 2, x = md.x, rot = md.rot) { + if (r2 == 
1) { #assuming r1, r2=1 + mod.x <- x[, r1] + mod.x[gene] <- 0 + return(var.test( + x = (x[, r1] %*% t(x = rot[, r1])), + y = (mod.x %*% t(x = rot[, r1])) + )$statistic) + } + mod.x <- x[, 1:r2] + mod.x[gene, r1:r2] <- rep(x = 0, r2 - r1 + 1) + return(var.test( + x = (x[, 1:r2] %*% t(x = rot[, 1:r2])), + y = (mod.x[, 1:r2] %*% t(x = rot[, 1:r2])) + )$statistic) +} + +#internal +EmpiricalP <- function(x, nullval) { + return(sum(nullval > x) / length(x = nullval)) +} diff --git a/R/multi_modal.R b/R/multi_modal.R new file mode 100644 index 000000000..3c0cdf24f --- /dev/null +++ b/R/multi_modal.R @@ -0,0 +1,92 @@ +#' @include seurat.R +NULL + +# Set up assay class to hold multimodal data sets + +assay <- setClass( + Class = "assay", + slots = list( + raw.data = "ANY", + data = "ANY", + scale.data = "ANY", + key = "character", + misc = "ANY", + var.genes="vector", + mean.var="data.frame" + ) +) + +#' Accessor function for multimodal data +#' +#' Pull information for specified stored dimensional reduction analysis +#' +#' @param object Seurat object +#' @param assay.type Type of assay to fetch data for (default is RNA) +#' @param slot Specific information to pull (i.e. raw.data, data, scale.data,...). Default is data +#' +#' @return Returns assay data +#' +#' @export +#' +GetAssayData <- function(object, assay.type = "RNA", slot = "data") { + if (assay.type == "RNA") { + if (slot == "raw.data") { + to.return <- object@raw.data + } else if (slot == "data") { + to.return <- object@data + } else if (slot == "scale.data") { + if (length(x = object@scale.data) == 0) { + stop("Object@scale.data has not been set. Run ScaleData() and then retry.") + } + to.return <- object@scale.data + } + #note that we check for this to avoid a long subset for large matrices if it can be avoided + if (length(x = object@cell.names) == ncol(to.return)) { + return(to.return) + } + return(to.return[, object@cell.names]) + } + if (! 
(assay.type %in% names(object@assay))) {
+    stop(paste(assay.type, "data has not been added"))
+  }
+  # Look the assay up by name. slot()/slotNames() replace the original
+  # parse()/eval() string building, which also misnamed eval()'s 'expr'
+  # argument as 'envir' (an error at runtime), and the lookup itself
+  # referenced the non-existent 'objectobject'.
+  assay.use <- object@assay[[assay.type]]
+  if (! (slot %in% slotNames(assay.use))) {
+    stop(paste(slot, "slot doesn't exist"))
+  }
+  to.return <- slot(object = assay.use, name = slot)
+  if (length(x = object@cell.names) == ncol(x = to.return)) {
+    return(to.return)
+  }
+  # Subset to the current cells only when needed (subsetting large matrices
+  # is expensive)
+  return(to.return[, object@cell.names])
+}
+
+#' Assay Data Mutator Function
+#'
+#' Store information for specified assay, for multimodal analysis
+#'
+#' @inheritParams GetAssayData
+#' @param new.data New data to insert
+#'
+#' @return Seurat object with updated slot
+#'
+#' @export
+#'
+SetAssayData <- function(object, assay.type, slot, new.data) {
+  # The RNA assay lives in dedicated slots directly on the seurat object
+  if (assay.type == "RNA") {
+    if (slot == "raw.data") {
+      object@raw.data <- new.data
+    } else if (slot == "data") {
+      object@data <- new.data
+    } else if (slot == "scale.data") {
+      object@scale.data <- new.data
+    }
+    return(object)
+  }
+  # Other assays are stored in the object@assay list, keyed by assay.type.
+  # (Fixes the 'objectobject@assay' typo; slot() replaces parse()/eval().)
+  if (assay.type %in% names(object@assay)) {
+    slot(object = object@assay[[assay.type]], name = slot) <- new.data
+  } else {
+    new.assay <- new(Class = "assay")
+    slot(object = new.assay, name = slot) <- new.data
+    object@assay[[assay.type]] <- new.assay
+  }
+  return(object)
+}
diff --git a/R/plotting.R b/R/plotting.R
new file mode 100644
index 000000000..08eff5e5f
--- /dev/null
+++ b/R/plotting.R
@@ -0,0 +1,2705 @@
+#' Gene expression heatmap
+#'
+#' Draws a heatmap of single cell gene expression using ggplot2.
+#'
+#' @param object Seurat object
+#' @param data.use Option to pass in data to use in the heatmap. Default will pick from either
+#' object@@data or object@@scale.data depending on use.scaled parameter. Should have cells as columns
+#' and genes as rows.
+#' @param use.scaled Whether to use the data or scaled data if data.use is NULL +#' @param cells.use Cells to include in the heatmap (default is all cells) +#' @param genes.use Genes to include in the heatmap (ordered) +#' @param disp.min Minimum display value (all values below are clipped) +#' @param disp.max Maximum display value (all values above are clipped) +#' @param group.by Groups cells by this variable. Default is object@@ident +#' @param draw.line Draw vertical lines delineating different groups +#' @param col.low Color for lowest expression value +#' @param col.mid Color for mid expression value +#' @param col.high Color for highest expression value +#' @param slim.col.label display only the identity class name once for each group +#' @param remove.key Removes the color key from the plot. +#' @param rotate.key Rotate color scale horizantally +#' @param cex.col Controls size of column labels (cells) +#' @param cex.row Controls size of row labels (genes) +#' @param group.label.loc Place group labels on bottom or top of plot. +#' @param group.label.rot Whether to rotate the group label. +#' @param group.cex Size of group label text +#' @param group.spacing Controls amount of space between columns. +#' @param do.plot Whether to display the plot. +#' @param assay.type Assay to scale data for. Default is RNA. Can be changed for multimodal analysis +#' @return Returns a ggplot2 plot object +#' @importFrom reshape2 melt +#' @importFrom dplyr %>% +#' @export +DoHeatmap <- function( + object, + data.use = NULL, + use.scaled = TRUE, + cells.use = NULL, + genes.use = NULL, + disp.min = -2.5, + disp.max = 2.5, + group.by = "ident", + draw.line = TRUE, + col.low = "#FF00FF", + col.mid = "#000000", + col.high = "#FFFF00", + slim.col.label = FALSE, + remove.key = FALSE, + rotate.key = FALSE, + title = NULL, + cex.col = 10, + cex.row = 10, + group.label.loc = "bottom", + group.label.rot = FALSE, + group.cex = 15, + group.spacing = 0.15, + do.plot = TRUE, + ... 
+) {
+  if (is.null(x = data.use)) {
+    # Default to scaled or normalized expression from the RNA assay
+    if (use.scaled) {
+      data.use <- GetAssayData(object, assay.type = "RNA", slot = "scale.data")
+    } else {
+      data.use <- GetAssayData(object, assay.type = "RNA", slot = "data")
+    }
+  }
+  # note: data.use should have cells as column names, genes as row names
+  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
+  cells.use <- intersect(x = cells.use, y = colnames(x = data.use))
+  if (length(x = cells.use) == 0) {
+    stop("No cells given to cells.use present in object")
+  }
+  # fixed: rownames() takes 'x', not 'y' — the old call errored whenever
+  # genes.use was left NULL (the default path)
+  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.use))
+  genes.use <- intersect(x = genes.use, y = rownames(x = data.use))
+  if (length(x = genes.use) == 0) {
+    stop("No genes given to genes.use present in object")
+  }
+  if (is.null(x = group.by) || group.by == "ident") {
+    cells.ident <- object@ident[cells.use]
+  } else {
+    cells.ident <- factor(x = FetchData(
+      object = object,
+      cells.use = cells.use,
+      vars.all = group.by
+    )[, 1])
+    names(x = cells.ident) <- cells.use
+  }
+  # Drop identity levels that are absent from the selected cells
+  cells.ident <- factor(
+    x = cells.ident,
+    labels = intersect(x = levels(x = cells.ident), y = cells.ident)
+  )
+  data.use <- data.use[genes.use, cells.use]
+  if (use.scaled) {
+    data.use <- MinMax(data = data.use, min = disp.min, max = disp.max)
+  }
+  # Reshape to long format (cell, gene, expression) for ggplot
+  data.use <- as.data.frame(x = t(x = data.use))
+  data.use$cell <- rownames(x = data.use)
+  colnames(x = data.use) <- make.unique(names = colnames(x = data.use))
+  data.use <- melt(data = data.use, id.vars = "cell")
+  names(x = data.use)[names(x = data.use) == 'variable'] <- 'gene'
+  names(x = data.use)[names(x = data.use) == 'value'] <- 'expression'
+  data.use$ident <- cells.ident[data.use$cell]
+  # breaks is only needed by the commented-out discrete-interval variant below
+  breaks <- seq(
+    from = min(data.use$expression),
+    to = max(data.use$expression),
+    length = length(x = pyCols) + 1
+  )
+  data.use$gene <- with(
+    data = data.use,
+    expr = factor(x = gene, levels = rev(x = unique(x = data.use$gene)))
+  )
+  data.use$cell <- with(
+    data = data.use,
+    expr = factor(x = cell, levels = 
cells.use) + ) + # might be a solution if we want discrete interval units, makes the legend clunky though + #data.use$expression <- cut(data.use$expression, breaks = breaks, include.lowest = T) + #heatmap <- ggplot(data.use, aes(x = cell, y = gene, fill = expression)) + geom_tile() + + # scale_fill_manual(values = pyCols, name= "Expression") + + # scale_y_discrete(position = "right", labels = rev(genes.use)) + + # theme(axis.line=element_blank(), axis.title.y=element_blank(), + # axis.ticks.y = element_blank()) + if (rotate.key) { + key.direction <- "horizontal" + key.title.pos <- "top" + } else { + key.direction <- "vertical" + key.title.pos <- "left" + } + heatmap <- ggplot( + data = data.use, + mapping = aes(x = cell, y = gene, fill = expression) + ) + + geom_tile() + + scale_fill_gradient2( + low = col.low, + mid = col.mid, + high = col.high, + name= "Expression", + guide = guide_colorbar( + direction = key.direction, + title.position = key.title.pos + ) + ) + + scale_y_discrete(position = "right", labels = rev(genes.use)) + + theme( + axis.line = element_blank(), + axis.title.y = element_blank(), + axis.ticks.y = element_blank(), + strip.text.x = element_text(size = group.cex), + axis.text.y = element_text(size = cex.row), + axis.text.x = element_text(size = cex.col), + axis.title.x = element_blank() + ) + if (slim.col.label) { + heatmap <- heatmap + + theme( + axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + axis.line = element_blank(), + axis.title.y = element_blank(), + axis.ticks.y = element_blank() + ) + } else { + heatmap <- heatmap + theme(axis.text.x = element_text(angle = 90)) + } + if (! 
is.null(x = group.by)) {
+    # facet_grid(switch = ...) controls whether the facet strips (the group
+    # labels) are drawn on the top (NULL) or bottom ('x') of the plot
+    if (group.label.loc == "top") {
+      switch <- NULL
+    } else {
+      switch <- 'x'
+    }
+    # fixed: removed the trailing comma after 'switch = switch', which
+    # passed an empty argument to facet_grid()
+    heatmap <- heatmap +
+      facet_grid(
+        facets = ~ident,
+        drop = TRUE,
+        space = "free",
+        scales = "free",
+        switch = switch
+      ) +
+      scale_x_discrete(expand = c(0, 0), drop = TRUE)
+    if (draw.line) {
+      panel.spacing <- unit(x = group.spacing, units = 'lines')
+    } else {
+      panel.spacing <- unit(x = 0, units = 'lines')
+    }
+    heatmap <- heatmap +
+      theme(strip.background = element_blank(), panel.spacing = panel.spacing)
+    if (group.label.rot) {
+      heatmap <- heatmap + theme(strip.text.x = element_text(angle = 90))
+    }
+  }
+  if (remove.key) {
+    heatmap <- heatmap + theme(legend.position = "none")
+  }
+  if (! is.null(x = title)) {
+    heatmap <- heatmap + labs(title = title)
+  }
+  if (do.plot) {
+    # fixed: a bare 'heatmap' value is not auto-printed inside a function,
+    # so do.plot previously had no effect; print() draws the plot explicitly
+    print(heatmap)
+  }
+  return(heatmap)
+}
+
+#' Single cell violin plot
+#'
+#' Draws a violin plot of single cell data (gene expression, metrics, PC
+#' scores, etc.)
+#' +#' @param object Seurat object +#' @param features.plot Features to plot (gene expression, metrics, PC scores, +#' anything that can be retreived by FetchData) +#' @param ident.include Which classes to include in the plot (default is all) +#' @param nCol Number of columns if multiple plots are displayed +#' @param do.sort Sort identity classes (on the x-axis) by the average +#' expression of the attribute being potted +#' @param y.max Maximum y axis value +#' @param same.y.lims Set all the y-axis limits to the same values +#' @param size.x.use X axis title font size +#' @param size.y.use Y axis title font size +#' @param size.title.use Main title font size +#' @param adjust.use Adjust parameter for geom_violin +#' @param point.size.use Point size for geom_violin +#' @param cols.use Colors to use for plotting +#' @param group.by Group (color) cells in different ways (for example, orig.ident) +#' @param y.log plot Y axis on log scale +#' @param x.lab.rot Rotate x-axis labels +#' @param y.lab.rot Rotate y-axis labels +#' @param legend.position Position the legend for the plot +#' @param single.legend Consolidate legend the legend for all plots +#' @param remove.legend Remove the legend from the plot +#' @param do.return Return a ggplot2 object (default : FALSE) +#' @param return.plotlist Return the list of individual plots instead of compiled plot. +#' @param \dots additional parameters to pass to FetchData (for example, use.imputed, use.scaled, use.raw) +#' @import ggplot2 +#' @importFrom cowplot plot_grid +#' @return By default, no return, only graphical output. If do.return=TRUE, +#' returns a list of ggplot objects. 
+#' @export +VlnPlot <- function( + object, + features.plot, + ident.include = NULL, + nCol = NULL, + do.sort = FALSE, + y.max = NULL, + same.y.lims = FALSE, + size.x.use = 16, + size.y.use = 16, + size.title.use = 20, + adjust.use = 1, + point.size.use = 1, + cols.use = NULL, + group.by = NULL, + y.log = FALSE, + x.lab.rot = FALSE, + y.lab.rot = FALSE, + legend.position = "right", + single.legend = TRUE, + remove.legend = FALSE, + do.return = FALSE, + return.plotlist = FALSE, + ... +) { + if (is.null(x = nCol)) { + if (length(x = features.plot) > 9) { + nCol <- 4 + } else { + nCol <- min(length(x = features.plot), 3) + } + } + data.use <- data.frame(FetchData(object = object, vars.all = features.plot, ...)) + if (is.null(x = ident.include)) { + cells.to.include <- object@cell.names + } else { + cells.to.include <- WhichCells(object = object, ident = ident.include) + } + data.use <- data.use[cells.to.include, ,drop = FALSE] + if (!is.null(x = group.by)) { + ident.use <- as.factor(x = FetchData( + object = object, + vars.all = group.by + )[cells.to.include, 1]) + } else { + ident.use <- object@ident[cells.to.include] + } + gene.names <- colnames(x = data.use)[colnames(x = data.use) %in% rownames(x = object@data)] + if (single.legend) { + remove.legend <- TRUE + } + if (same.y.lims && is.null(x = y.max)) { + y.max <- max(data.use) + } + plots <- lapply( + X = features.plot, + FUN = function(x) { + return(SingleVlnPlot( + feature = x, + data = data.use[, x, drop = FALSE], + cell.ident = ident.use, + do.sort = do.sort, y.max = y.max, + size.x.use = size.x.use, + size.y.use = size.y.use, + size.title.use = size.title.use, + adjust.use = adjust.use, + point.size.use = point.size.use, + cols.use = cols.use, + gene.names = gene.names, + y.log = y.log, + x.lab.rot = x.lab.rot, + y.lab.rot = y.lab.rot, + legend.position = legend.position, + remove.legend = remove.legend + )) + } + ) + if (length(x = features.plot) > 1) { + plots.combined <- plot_grid(plotlist = plots, ncol = 
nCol) + if (single.legend && !remove.legend) { + legend <- get_legend( + plot = plots[[1]] + theme(legend.position = legend.position) + ) + if (legend.position == "bottom") { + plots.combined <- plot_grid( + plots.combined, + legend, + ncol = 1, + rel_heights = c(1, .2) + ) + } else if (legend.position == "right") { + plots.combined <- plot_grid( + plots.combined, + legend, + rel_widths = c(3, .3) + ) + } else { + warning("Shared legends must be at the bottom or right of the plot") + } + } + } else { + plots.combined <- plots[[1]] + } + if (do.return) { + if (return.plotlist) { + return(plots) + } else { + return(plots.combined) + } + } else { + if (length(x = plots.combined) > 1) { + plots.combined + } + else { + invisible(x = lapply(X = plots.combined, FUN = print)) + } + } +} + +#' Dot plot visualization +#' +#' Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +#' The size of the dot encodes the percentage of cells within a class, while the color encodes the +#' AverageExpression level of 'expressing' cells (green is high). +#' +#' @param object Seurat object +#' @param genes.plot Input vector of genes +#' @param cex.use Scaling factor for the dots (scales all dot sizes) +#' @param cols.use colors to plot +#' @param thresh.col The raw data value which corresponds to a red dot (lowest expression) +#' @param dot.min The fraction of cells at which to draw the smallest dot (default is 0.05) +#' @param group.by Factor to group the cells by +#' @return Only graphical output +#' @export +DotPlot <- function( + object, + genes.plot, + cex.use = 2, + cols.use = NULL, + thresh.col = 2.5, + dot.min = 0.05, + group.by = NULL, + ... +) { + if (! 
is.null(x = group.by)) { + object <- SetAllIdent(object = object, id = group.by) + } + #object@data=object@data[genes.plot,] + object@data <- data.frame(t(x = FetchData(object = object, vars.all = genes.plot))) + #this line is in case there is a '-' in the cell name + colnames(x = object@data) <- object@cell.names + avg.exp <- AverageExpression(object = object) + avg.alpha <- AverageDetectionRate(object = object) + cols.use <- SetIfNull(x = cols.use, default = CustomPalette(low = "red", high = "green")) + exp.scale <- t(x = scale(x = t(x = avg.exp))) + exp.scale <- MinMax(data = exp.scale, max = thresh.col, min = (-1) * thresh.col) + n.col <- length(x = cols.use) + data.y <- rep(x = 1:ncol(x = avg.exp), nrow(x = avg.exp)) + data.x <- unlist(x = lapply(X = 1:nrow(x = avg.exp), FUN = rep, ncol(x = avg.exp))) + data.avg <- unlist(x = lapply( + X = 1:length(x = data.y), + FUN = function(x) { + return(exp.scale[data.x[x], data.y[x]]) + } + )) + exp.col <- cols.use[floor( + x = n.col * (data.avg + thresh.col) / (2 * thresh.col) + .5 + )] + data.cex <- unlist(x = lapply( + X = 1:length(x = data.y), + FUN = function(x) { + return(avg.alpha[data.x[x], data.y[x]]) + } + )) * cex.use + dot.min + plot( + x = data.x, + y = data.y, + cex = data.cex, + pch = 16, + col = exp.col, + xaxt = "n", + xlab = "", + ylab = "", + yaxt = "n" + ) + axis(side = 1, at = 1:length(x = genes.plot), labels = genes.plot) + axis(side = 2, at = 1:ncol(x = avg.alpha), colnames(x = avg.alpha), las = 1) +} + +#' Dot plot visualization +#' +#' Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +#' The size of the dot encodes the percentage of cells within a class, while the color encodes the +#' AverageExpression level of 'expressing' cells (green is high). 
+#' +#' @param object Seurat object +#' @param genes.plot Input vector of genes +#' @param cols.use colors to plot +#' @param col.min Minimum scaled average expression threshold (everything smaller will be set to this) +#' @param col.max Maximum scaled average expression threshold (everything larger will be set to this) +#' @param dot.min The fraction of cells at which to draw the smallest dot (default is 0.05). +#' @param dot.scale Scale the size of the points, similar to cex +#' @param group.by Factor to group the cells by +#' @param plot.legend plots the legends +#' @param x.lab.rot Rotate x-axis labels +#' @param do.return Return ggplot2 object +#' @return default, no return, only graphical output. If do.return=TRUE, returns a ggplot2 object +#' @importFrom dplyr %>% group_by summarize_each mutate ungroup +#' @importFrom tidyr gather +#' @export +DotPlotGG <- function( + object, + genes.plot, + cols.use = c("green", "red"), + col.min = -2.5, + col.max = 2.5, + dot.min = 0, + dot.scale = 6, + group.by, + plot.legend = FALSE, + do.return = FALSE, + x.lab.rot = FALSE +) { + if (! 
missing(x = group.by)) { + object <- SetAllIdent(object = object, id = group.by) + } + data.to.plot <- data.frame(FetchData(object = object, vars.all = genes.plot)) + data.to.plot$cell <- rownames(x = data.to.plot) + data.to.plot$id <- object@ident + data.to.plot %>% gather( + key = genes.plot, + value = expression, + -c(cell, id) + ) -> data.to.plot + data.to.plot %>% + group_by(id, genes.plot) %>% + summarize( + avg.exp = ExpMean(x = expression), + pct.exp = PercentAbove(x = expression, threshold = 0) + ) -> data.to.plot + data.to.plot %>% + ungroup() %>% + group_by(genes.plot) %>% + mutate(avg.exp = as.numeric(x = scale(center = avg.exp))) %>% + mutate(avg.exp.scale = MinMax( + data = avg.exp, + max = col.max, + min = col.min + )) -> data.to.plot + data.to.plot$genes.plot <- factor( + x = data.to.plot$genes.plot, + levels = rev(x = sub(pattern = "-", replacement = ".", x = genes.plot)) + ) + data.to.plot$pct.exp[data.to.plot$pct.exp < dot.min] <- NA + p <- ggplot(data = data.to.plot, mapping = aes(x = genes.plot, y = id)) + + geom_point(mapping = aes(size = pct.exp, color = avg.exp.scale)) + + scale_radius(range = c(0, dot.scale)) + + scale_color_gradient(low = cols.use[1], high = cols.use[2]) + + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) + if (! plot.legend) { + p <- p + theme(legend.position = "none") + } + if (x.lab.rot) { + p <- p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + } + suppressWarnings(print(p)) + if (do.return) { + return(p) + } +} + + +#' Split Dot plot visualization +#' +#' Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +#' The size of the dot encodes the percentage of cells within a class, while the color encodes the +#' AverageExpression level of 'expressing' cells (green is high). Splits the cells into two groups based on a +#' grouping variable. 
+#' Still in BETA +#' +#' @param object Seurat object +#' @param grouping.var Grouping variable for splitting the dataset +#' @param genes.plot Input vector of genes +#' @param cols.use colors to plot +#' @param col.min Minimum scaled average expression threshold (everything smaller will be set to this) +#' @param col.max Maximum scaled average expression threshold (everything larger will be set to this) +#' @param dot.min The fraction of cells at which to draw the smallest dot (default is 0.05). +#' @param dot.scale Scale the size of the points, similar to cex +#' @param group.by Factor to group the cells by +#' @param plot.legend plots the legends +#' @param x.lab.rot Rotate x-axis labels +#' @param do.return Return ggplot2 object +#' @param gene.groups Add labeling bars to the top of the plot +#' @return default, no return, only graphical output. If do.return=TRUE, returns a ggplot2 object +#' @importFrom dplyr %>% group_by summarize_each mutate ungroup +#' @importFrom tidyr gather +#' @export +SplitDotPlotGG <- function( + object, + grouping.var, + genes.plot, + gene.groups, + cols.use = c("green", "red"), + col.min = -2.5, + col.max = 2.5, + dot.min = 0, + dot.scale = 6, + group.by, + plot.legend = FALSE, + do.return = FALSE, + x.lab.rot = FALSE +) { + if (! 
missing(x = group.by)) { + object <- SetAllIdent(object = object, id = group.by) + } + grouping.data <- FetchData( + object = object, + vars.all = grouping.var + )[names(x = object@ident), 1] + idents.old <- levels(x = object@ident) + object@ident <- paste(object@ident, grouping.data, sep="_") + object@ident <- factor( + x = object@ident, + levels = unlist(x = lapply( + X = idents.old, + FUN = function(x) { + return(c( + paste(x, unique(x = grouping.data)[1], sep="_"), + paste(x, unique(x = grouping.data)[2], sep="_") + )) + } + )), + ordered = TRUE + ) + data.to.plot <- data.frame(FetchData(object = object, vars.all = genes.plot)) + data.to.plot$cell <- rownames(x = data.to.plot) + data.to.plot$id <- object@ident + data.to.plot %>% + gather(key = genes.plot, value = expression, -c(cell, id)) -> data.to.plot + data.to.plot %>% + group_by(id, genes.plot) %>% + summarize( + avg.exp = ExpMean(x = expression), + pct.exp = PercentAbove(x = expression, threshold = 0) + ) -> data.to.plot + ids.2 <- paste( + idents.old, + as.character(x = unique(x = grouping.data)[2]), + sep = "_" + ) + vals.2 <- which(x = data.to.plot$id %in% ids.2) + ids.1 <- paste( + idents.old, + as.character(x = unique(x = grouping.data)[1]), + sep = "_" + ) + vals.1 <- which(x = data.to.plot$id %in% ids.1) + #data.to.plot[vals.2,3]=-1*data.to.plot[vals.2,3] + data.to.plot %>% + ungroup() %>% + group_by(genes.plot) %>% + mutate(avg.exp = scale(avg.exp)) %>% + mutate(avg.exp.scale = as.numeric(x = cut( + x = MinMax(data = avg.exp, max = col.max, min = col.min), + breaks = 20 + ))) -> data.to.plot + data.to.plot$genes.plot <- factor( + x = data.to.plot$genes.plot, + levels = rev(x = sub(pattern = "-", replacement = ".", x = genes.plot)) + ) + data.to.plot$pct.exp[data.to.plot$pct.exp < dot.min] <- NA + palette.1 <- CustomPalette(low = "grey", high = "blue", k = 20) + palette.2 <- CustomPalette(low = "grey", high = "red", k = 20) + data.to.plot$ptcolor <- "grey" + data.to.plot[vals.1, "ptcolor"] <- 
palette.1[as.matrix( + x = data.to.plot[vals.1, "avg.exp.scale"] + )[, 1]] + data.to.plot[vals.2, "ptcolor"] <- palette.2[as.matrix( + x = data.to.plot[vals.2, "avg.exp.scale"] + )[, 1]] + if (! missing(x = gene.groups)) { + names(x = gene.groups) <- genes.plot + data.to.plot %>% + mutate(gene.groups = gene.groups[genes.plot]) -> data.to.plot + } + p <- ggplot(data = data.to.plot, mapping = aes(x = genes.plot, y = id)) + + geom_point(mapping = aes(size = pct.exp, color = ptcolor)) + + scale_radius(range = c(0, dot.scale)) + + scale_color_identity() + + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) + if (! missing(x = gene.groups)) { + p <- p + + facet_grid( + facets = ~gene.groups, + scales = "free_x", + space = "free_x", + switch = "y" + ) + + theme( + panel.spacing = unit(x = 1, units = "lines"), + strip.background = element_blank(), + strip.placement = "outside" + ) + } + if (! plot.legend) { + p <- p + theme(legend.position = "none") + } + if (x.lab.rot) { + p <- p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + } + suppressWarnings(print(p)) + if (do.return) { + return(p) + } +} + +#' Visualize 'features' on a dimensional reduction plot +#' +#' Colors single cells on a dimensional reduction plot according to a 'feature' +#' (i.e. gene expression, PC scores, number of genes detected, etc.) 
+#' +#' +#' @param object Seurat object +#' @param features.plot Vector of features to plot +#' @param min.cutoff Vector of minimum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 1, 10) +#' @param max.cutoff Vector of maximum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 1, 10) +#' @param dim.1 Dimension for x-axis (default 1) +#' @param dim.2 Dimension for y-axis (default 2) +#' @param cells.use Vector of cells to plot (default is all cells) +#' @param pt.size Adjust point size for plotting +#' @param cols.use The two colors to form the gradient over. Provide as string vector with +#' the first color corresponding to low values, the second to high. Also accepts a Brewer +#' color scale or vector of colors. Note: this will bin the data into number of colors provided. +#' @param pch.use Pch for plotting +#' @param overlay Plot two features overlayed one on top of the other +#' @param do.hover Enable hovering over points to view information +#' @param data.hover Data to add to the hover, pass a character vector of features to add. Defaults to cell name +#' @param do.identify Opens a locator session to identify clusters of cells +#' @param reduction.use Which dimensionality reduction to use. Default is +#' "tsne", can also be "pca", or "ica", assuming these are precomputed. +#' @param use.imputed Use imputed values for gene expression (default is FALSE) +#' @param nCol Number of columns to use when plotting multiple features. +#' @param no.axes Remove axis labels +#' @param no.legend Remove legend from the graph. Default is TRUE. 
+#' @param dark.theme Plot in a dark theme +#' @param do.return return the ggplot2 object +#' +#' @importFrom RColorBrewer brewer.pal.info +#' +#' @return No return value, only a graphical output +#' +#' @export +#' +FeaturePlot <- function( + object, + features.plot, + min.cutoff = NA, + max.cutoff = NA, + dim.1 = 1, + dim.2 = 2, + cells.use = NULL, + pt.size = 1, + cols.use = c("yellow", "red"), + pch.use = 16, + overlay = FALSE, + do.hover = FALSE, + data.hover = NULL, + do.identify = FALSE, + reduction.use = "tsne", + use.imputed = FALSE, + nCol = NULL, + no.axes = FALSE, + no.legend = TRUE, + dark.theme = FALSE, + do.return = FALSE +) { + cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data)) + if (is.null(x = nCol)) { + nCol <- 2 + if (length(x = features.plot) == 1) { + nCol <- 1 + } + if (length(x = features.plot) > 6) { + nCol <- 3 + } + if (length(x = features.plot) > 9) { + nCol <- 4 + } + } + num.row <- floor(x = length(x = features.plot) / nCol - 1e-5) + 1 + if (overlay | do.hover) { + num.row <- 1 + nCol <- 1 + } + par(mfrow = c(num.row, nCol)) + dim.code <- GetDimReduction( + object = object, + reduction.type = reduction.use, + slot = 'key' + ) + dim.codes <- paste0(dim.code, c(dim.1, dim.2)) + data.plot <- as.data.frame(GetCellEmbeddings(object = object, + reduction.type = reduction.use, + dims.use = c(dim.1, dim.2), + cells.use = cells.use)) + x1 <- paste0(dim.code, dim.1) + x2 <- paste0(dim.code, dim.2) + data.plot$x <- data.plot[, x1] + data.plot$y <- data.plot[, x2] + data.plot$pt.size <- pt.size + names(x = data.plot) <- c('x', 'y') + # data.plot$pt.size <- pt.size + data.use <- t(x = FetchData( + object = object, + vars.all = features.plot, + cells.use = cells.use, + use.imputed = use.imputed + )) + # Check mins and maxes + min.cutoff <- mapply( + FUN = function(cutoff, feature) { + ifelse( + test = is.na(x = cutoff), + yes = min(data.use[feature, ]), + no = cutoff + ) + }, + cutoff = min.cutoff, + feature = features.plot + 
) + max.cutoff <- mapply( + FUN = function(cutoff, feature) { + ifelse( + test = is.na(x = cutoff), + yes = max(data.use[feature, ]), + no = cutoff + ) + }, + cutoff = max.cutoff, + feature = features.plot + ) + check_lengths = unique(x = vapply( + X = list(features.plot, min.cutoff, max.cutoff), + FUN = length, + FUN.VALUE = numeric(length = 1) + )) + if (length(x = check_lengths) != 1) { + stop('There must be the same number of minimum and maximum cuttoffs as there are features') + } + if (overlay) { + # Wrap as a list for MutiPlotList + pList <- list( + BlendPlot( + data.use = data.use, + features.plot = features.plot, + data.plot = data.plot, + pt.size = pt.size, + pch.use = pch.use, + cols.use = cols.use, + dim.codes = dim.codes, + min.cutoff = min.cutoff, + max.cutoff = max.cutoff, + no.axes = no.axes, + no.legend = no.legend, + dark.theme = dark.theme + ) + ) + } else { + # Use mapply instead of lapply for multiple iterative variables. + pList <- mapply( + FUN = SingleFeaturePlot, + feature = features.plot, + min.cutoff = min.cutoff, + max.cutoff = max.cutoff, + MoreArgs = list( # Arguments that are not being repeated + data.use = data.use, + data.plot = data.plot, + pt.size = pt.size, + pch.use = pch.use, + cols.use = cols.use, + dim.codes = dim.codes, + no.axes = no.axes, + no.legend = no.legend, + dark.theme = dark.theme + ), + SIMPLIFY = FALSE # Get list, not matrix + ) + } + if (do.hover) { + if (length(x = pList) != 1) { + stop("'do.hover' only works on a single feature or an overlayed FeaturePlot") + } + if (is.null(x = data.hover)) { + features.info <- NULL + } else { + features.info <- FetchData(object = object, vars.all = data.hover) + } + # Use pList[[1]] to properly extract the ggplot out of the plot list + return(HoverLocator( + plot = pList[[1]], + data.plot = data.plot, + features.info = features.info, + dark.theme = dark.theme, + title = features.plot + )) + # invisible(readline(prompt = 'Press to continue\n')) + } else if (do.identify) { + 
if (length(x = pList) != 1) { + stop("'do.identify' only works on a single feature or an overlayed FeaturePlot") + } + # Use pList[[1]] to properly extract the ggplot out of the plot list + return(FeatureLocator( + plot = pList[[1]], + data.plot = data.plot, + dark.theme = dark.theme + )) + } else { + print(x = cowplot::plot_grid(plotlist = pList, ncol = nCol)) + } + ResetPar() + if (do.return){ + return(pList) + } +} + +#' Vizualization of multiple features +#' +#' Similar to FeaturePlot, however, also splits the plot by visualizing each +#' identity class separately. +#' +#' Particularly useful for seeing if the same groups of cells co-exhibit a +#' common feature (i.e. co-express a gene), even within an identity class. Best +#' understood by example. +#' +#' @param object Seurat object +#' @param features.plot Vector of features to plot +#' @param dim.1 Dimension for x-axis (default 1) +#' @param dim.2 Dimension for y-axis (default 2) +#' @param idents.use Which identity classes to display (default is all identity +#' classes) +#' @param pt.size Adjust point size for plotting +#' @param cols.use Ordered vector of colors to use for plotting. Default is +#' heat.colors(10). +#' @param pch.use Pch for plotting +#' @param reduction.use Which dimensionality reduction to use. Default is +#' "tsne", can also be "pca", or "ica", assuming these are precomputed. +#' @param group.by Group cells in different ways (for example, orig.ident) +#' @param sep.scale Scale each group separately. Default is FALSE. 
#'
#' @param max.exp Max cutoff for scaled expression value
#' @param min.exp Min cutoff for scaled expression value
#' @param rotate.key rotate the legend
#' @param plot.horiz rotate the plot such that the features are columns, groups are the rows
#' @param key.position position of the legend ("top", "right", "bottom", "left")
#' @param do.return Return the ggplot2 object
#'
#' @return No return value, only a graphical output
#'
#' @importFrom dplyr %>% mutate_each group_by select ungroup
#'
#' @export
#'
FeatureHeatmap <- function(
  object,
  features.plot,
  dim.1 = 1,
  dim.2 = 2,
  idents.use = NULL,
  pt.size = 2,
  cols.use = c("grey", "red"),
  pch.use = 16,
  reduction.use = "tsne",
  group.by = NULL,
  sep.scale = FALSE,
  do.return = FALSE,
  min.exp = -Inf,
  max.exp = Inf,
  rotate.key = FALSE,
  plot.horiz = FALSE,
  key.position = "right"
) {
  # Optionally re-group cells (e.g. by orig.ident) before faceting per identity
  if (! is.null(x = group.by)) {
    object <- SetAllIdent(object = object, id = group.by)
  }
  # Default to every identity class, in sorted order
  idents.use <- SetIfNull(x = idents.use, default = sort(x = unique(x = object@ident)))
  # NOTE(review): par(mfrow) has no effect on the ggplot drawn below;
  # presumably a leftover from a base-graphics implementation
  par(mfrow = c(length(x = features.plot), length(x = idents.use)))
  # Axis labels come from the reduction's key (e.g. "tSNE_1", "tSNE_2")
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, c(dim.1, dim.2))
  data.plot <- data.frame(FetchData(
    object = object,
    vars.all = c(dim.codes, features.plot)
  ))
  colnames(x = data.plot)[1:2] <- c("dim1", "dim2")
  data.plot$ident <- as.character(x = object@ident)
  data.plot$cell <- rownames(x = data.plot)
  # data.frame() converts '-' in feature names to '.'; rename to match columns
  features.plot <- gsub('-', '\\.', features.plot)
  # Reshape wide -> long: one row per (cell, gene) pair
  data.plot %>% gather(gene, expression, features.plot, -dim1, -dim2, -ident, -cell) -> data.plot
  if (sep.scale) {
    # Scale each gene within each identity class separately
    data.plot %>% group_by(ident, gene) %>% mutate(scaled.expression = scale(expression)) -> data.plot
  } else {
    # Scale each gene across all cells at once
    # NOTE(review): the grouping added here is never ungroup()ed downstream
    data.plot %>% group_by(gene) %>% mutate(scaled.expression = scale(expression)) -> data.plot
  }
  # Preserve the caller's feature order in the facet layout
  data.plot$gene <- factor(x = data.plot$gene, levels = features.plot)
  # Clip scaled values to [min.exp, max.exp] for display
  data.plot$scaled.expression <- MinMax(
    data = data.plot$scaled.expression,
    min = min.exp,
    max = max.exp
  )
  if (rotate.key) {
    key.direction <- "horizontal"
    key.title.pos <- "top"
  } else {
    key.direction <- "vertical"
    key.title.pos <- "left"
  }
  p <- ggplot(data = data.plot, mapping = aes(x = dim1, y = dim2)) +
    geom_point(mapping = aes(colour = scaled.expression), size = pt.size)
  if (rotate.key) {
    p <- p + scale_colour_gradient(
      low = cols.use[1],
      high = cols.use[2],
      guide = guide_colorbar(
        direction = key.direction,
        title.position = key.title.pos,
        title = "Scaled Expression"
      )
    )
  } else {
    p <- p + scale_colour_gradient(
      low = cols.use[1],
      high = cols.use[2],
      guide = guide_colorbar(title = "Scaled Expression")
    )
  }
  # Facet layout: features as columns (plot.horiz) or rows (default)
  if(plot.horiz){
    p <- p + facet_grid(ident ~ gene)
  }
  else{
    p <- p + facet_grid(gene ~ ident)
  }
  p2 <- p +
    theme_bw() +
    NoGrid() +
    ylab(label = dim.codes[2]) +
    xlab(label = dim.codes[1])
  p2 <- p2 + theme(legend.position = key.position)
  if (do.return) {
    return(p2)
  }
  print(p2)
}

#' Gene expression heatmap
#'
#' Draws a heatmap of single cell gene expression using the heatmap.2 function. Has been replaced by the ggplot2
#' version (now in DoHeatmap), but kept for legacy
#'
#' @param object Seurat object
#' @param cells.use Cells to include in the heatmap (default is all cells)
#' @param genes.use Genes to include in the heatmap (ordered)
#' @param disp.min Minimum display value (all values below are clipped)
#' @param disp.max Maximum display value (all values above are clipped)
#' @param draw.line Draw vertical lines delineating cells in different identity
#' classes.
#' @param do.return Default is FALSE. If TRUE, return a matrix of scaled values
#' which would be passed to heatmap.2
#' @param order.by.ident Order cells in the heatmap by identity class (default
#' is TRUE).
If FALSE, cells are ordered based on their order in cells.use +#' @param col.use Color palette to use +#' @param slim.col.label if (order.by.ident==TRUE) then instead of displaying +#' every cell name on the heatmap, display only the identity class name once +#' for each group +#' @param group.by If (order.by.ident==TRUE) default, you can group cells in +#' different ways (for example, orig.ident) +#' @param remove.key Removes the color key from the plot. +#' @param cex.col positive numbers, used as cex.axis in for the column axis labeling. +#' The defaults currently only use number of columns +#' @param do.scale whether to use the data or scaled data +#' @param ... Additional parameters to heatmap.2. Common examples are cexRow +#' and cexCol, which set row and column text sizes +#' +#' @return If do.return==TRUE, a matrix of scaled values which would be passed +#' to heatmap.2. Otherwise, no return value, only a graphical output +#' +#' @importFrom gplots heatmap.2 +#' +#' @export +#' +OldDoHeatmap <- function( + object, + cells.use = NULL, + genes.use = NULL, + disp.min = NULL, + disp.max = NULL, + draw.line = TRUE, + do.return = FALSE, + order.by.ident = TRUE, + col.use = pyCols, + slim.col.label = FALSE, + group.by = NULL, + remove.key = FALSE, + cex.col = NULL, + do.scale = TRUE, + ... +) { + cells.use <- SetIfNull(x = cells.use, default = object@cell.names) + cells.use <- intersect(x = cells.use, y = object@cell.names) + cells.ident <- object@ident[cells.use] + if (! 
is.null(x = group.by)) { + cells.ident <- factor(x = FetchData( + object = object, + vars.all = group.by + )[, 1]) + } + cells.ident <- factor( + x = cells.ident, + labels = intersect(x = levels(x = cells.ident), y = cells.ident) + ) + if (order.by.ident) { + cells.use <- cells.use[order(cells.ident)] + } else { + cells.ident <- factor( + x = cells.ident, + levels = as.vector(x = unique(x = cells.ident)) + ) + } + #determine assay type + data.use <- NULL + assays.use <- c("RNA", names(x = object@assay)) + if (do.scale) { + slot.use <- "scale.data" + if ((is.null(x = disp.min) || is.null(x = disp.max))) { + disp.min <- -2.5 + disp.max <- 2.5 + } + } else { + slot.use <- "data" + if ((is.null(x = disp.min) || is.null(x = disp.max))) { + disp.min <- -Inf + disp.max <- Inf + } + } + for (assay.check in assays.use) { + data.assay <- GetAssayData( + object = object, + assay.type = assay.check, + slot = slot.use + ) + genes.intersect <- intersect(x = genes.use, y = rownames(x = data.assay)) + new.data <- data.assay[genes.intersect, cells.use, drop = FALSE] + if (! (is.matrix(x = new.data))) { + new.data <- as.matrix(x = new.data) + } + data.use <- rbind(data.use, new.data) + } + data.use <- MinMax(data = data.use, min = disp.min, max = disp.max) + vline.use <- NULL + colsep.use <- NULL + if (remove.key) { + hmFunction <- heatmap2NoKey + } else { + hmFunction <- heatmap.2 + } + if (draw.line) { + colsep.use <- cumsum(x = table(cells.ident)) + } + if (slim.col.label && order.by.ident) { + col.lab <- rep("", length(x = cells.use)) + col.lab[round(x = cumsum(x = table(cells.ident)) - table(cells.ident) / 2) + 1] <- levels(x = cells.ident) + cex.col <- SetIfNull( + x = cex.col, + default = 0.2 + 1 / log10(x = length(x = unique(x = cells.ident))) + ) + hmFunction( + data.use, + Rowv = NA, + Colv = NA, + trace = "none", + col = col.use, + colsep = colsep.use, + labCol = col.lab, + cexCol = cex.col, + ... 
+ ) + } else if (slim.col.label) { + col.lab = rep("", length(x = cells.use)) + cex.col <- SetIfNull( + x = cex.col, + default = 0.2 + 1 / log10(x = length(x = unique(x = cells.ident))) + ) + hmFunction( + data.use, + Rowv = NA, + Colv = NA, + trace = "none", + col = col.use, + colsep = colsep.use, + labCol = col.lab, + cexCol = cex.col, + ... + ) + } else { + hmFunction( + data.use, + Rowv = NA, + Colv = NA, + trace = "none", + col = col.use, + colsep = colsep.use, + ... + ) + } + if (do.return) { + return(data.use) + } +} + +#' JackStraw Plot +#' +#' Plots the results of the JackStraw analysis for PCA significance. For each +#' PC, plots a QQ-plot comparing the distribution of p-values for all genes +#' across each PC, compared with a uniform distribution. Also determines a +#' p-value for the overall significance of each PC (see Details). +#' +#' Significant PCs should show a p-value distribution (black curve) that is +#' strongly skewed to the left compared to the null distribution (dashed line) +#' The p-value for each PC is based on a proportion test comparing the number +#' of genes with a p-value below a particular threshold (score.thresh), compared with the +#' proportion of genes expected under a uniform distribution of p-values. +#' +#' @param object Seurat plot +#' @param PCs Which PCs to examine +#' @param nCol Number of columns +#' @param score.thresh Threshold to use for the proportion test of PC +#' significance (see Details) +#' @param plot.x.lim X-axis maximum on each QQ plot. +#' @param plot.y.lim Y-axis maximum on each QQ plot. 
+#' +#' @return A ggplot object +#' +#' @author Thanks to Omri Wurtzel for integrating with ggplot +#' +#' @import gridExtra +#' +#' @export +#' +JackStrawPlot <- function( + object, + PCs = 1:5, + nCol = 3, + score.thresh = 1e-5, + plot.x.lim = 0.1, + plot.y.lim = 0.3 +) { + pAll <- GetDimReduction(object,reduction.type = "pca", slot = "jackstraw")@emperical.p.value + pAll <- pAll[, PCs, drop = FALSE] + pAll <- as.data.frame(pAll) + pAll$Contig <- rownames(x = pAll) + pAll.l <- melt(data = pAll, id.vars = "Contig") + colnames(x = pAll.l) <- c("Contig", "PC", "Value") + qq.df <- NULL + score.df <- NULL + for (i in PCs) { + q <- qqplot(x = pAll[, i], y = runif(n = 1000), plot.it = FALSE) + #pc.score=mean(q$y[which(q$x <=score.thresh)]) + pc.score <- suppressWarnings(prop.test( + x = c( + length(x = which(x = pAll[, i] <= score.thresh)), + floor(x = nrow(x = pAll) * score.thresh) + ), + n = c(nrow(pAll), nrow(pAll)) + )$p.val) + if (length(x = which(x = pAll[, i] <= score.thresh)) == 0) { + pc.score <- 1 + } + if (is.null(x = score.df)) { + score.df <- data.frame(PC = paste0("PC", i), Score = pc.score) + } else { + score.df <- rbind(score.df, data.frame(PC = paste0("PC",i), Score = pc.score)) + } + if (is.null(x = qq.df)) { + qq.df <- data.frame(x = q$x, y = q$y, PC = paste0("PC", i)) + } else { + qq.df <- rbind(qq.df, data.frame(x = q$x, y = q$y, PC = paste0("PC", i))) + } + } + # create new dataframe column to wrap on that includes the PC number and score + pAll.l$PC.Score <- rep( + x = paste0(score.df$PC, " ", sprintf("%1.3g", score.df$Score)), + each = length(x = unique(x = pAll.l$Contig)) + ) + pAll.l$PC.Score <- factor( + x = pAll.l$PC.Score, + levels = paste0(score.df$PC, " ", sprintf("%1.3g", score.df$Score)) + ) + gp <- ggplot(data = pAll.l, mapping = aes(sample=Value)) + + stat_qq(distribution = qunif) + + facet_wrap("PC.Score", ncol = nCol) + + labs(x = "Theoretical [runif(1000)]", y = "Empirical") + + xlim(0, plot.y.lim) + + ylim(0, plot.x.lim) + + 
coord_flip() + + geom_abline(intercept = 0, slope = 1, linetype = "dashed", na.rm = TRUE) + + theme_bw() + return(gp) +} + +#' Scatter plot of single cell data +#' +#' Creates a scatter plot of two features (typically gene expression), across a +#' set of single cells. Cells are colored by their identity class. +#' +#' @param object Seurat object +#' @inheritParams FetchData +#' @param gene1 First feature to plot. Typically gene expression but can also +#' be metrics, PC scores, etc. - anything that can be retreived with FetchData +#' @param gene2 Second feature to plot. +#' @param cell.ids Cells to include on the scatter plot. +#' @param col.use Colors to use for identity class plotting. +#' @param pch.use Pch argument for plotting +#' @param cex.use Cex argument for plotting +#' @param use.imputed Use imputed values for gene expression (Default is FALSE) +#' @param use.scaled Use scaled data +#' @param use.raw Use raw data +#' @param do.hover Enable hovering over points to view information +#' @param data.hover Data to add to the hover, pass a character vector of features to add. Defaults to cell name +#' @param do.identify Opens a locator session to identify clusters of cells. +#' @param dark.theme Use a dark theme for the plot +#' @param do.spline Add a spline (currently hardwired to df=4, to be improved) +#' @param spline.span spline span in loess function call +#' @param \dots Additional arguments to be passed to plot. +#' +#' @return No return, only graphical output +#' +#' @export +#' +GenePlot <- function( + object, + gene1, + gene2, + cell.ids = NULL, + col.use = NULL, + pch.use = 16, + cex.use = 1.5, + use.imputed = FALSE, + use.scaled = FALSE, + use.raw = FALSE, + do.hover = FALSE, + data.hover = NULL, + do.identify = FALSE, + dark.theme = FALSE, + do.spline = FALSE, + spline.span = 0.75, + ... 
+) { + cell.ids <- SetIfNull(x = cell.ids, default = object@cell.names) + # Don't transpose the data.frame for better compatability with FeatureLocator and the rest of Seurat + data.use <- as.data.frame( + x = FetchData( + object = object, + vars.all = c(gene1, gene2), + cells.use = cell.ids, + use.imputed = use.imputed, + use.scaled = use.scaled, + use.raw = use.raw + ) + ) + # Ensure that our data is only the cells we're working with and + # the genes we want. This step seems kind of redundant though... + data.plot <- data.use[cell.ids, c(gene1, gene2)] + # Set names to 'x' and 'y' for easy calling later on + names(x = data.plot) <- c('x', 'y') + ident.use <- as.factor(x = object@ident[cell.ids]) + if (length(x = col.use) > 1) { + col.use <- col.use[as.numeric(x = ident.use)] + } else { + col.use <- SetIfNull(x = col.use, default = as.numeric(x = ident.use)) + } + gene.cor <- round(x = cor(x = data.plot$x, y = data.plot$y), digits = 2) + if (dark.theme) { + par(bg = 'black') + col.use <- sapply( + X = col.use, + FUN = function(color) ifelse( + test = all(col2rgb(color) == 0), + yes = 'white', + no = color + ) + ) + axes = FALSE + col.lab = 'white' + } else { + axes = TRUE + col.lab = 'black' + } + # Plot the data + plot( + x = data.plot$x, + y = data.plot$y, + xlab = gene1, + ylab = gene2, + col = col.use, + cex = cex.use, + main = gene.cor, + pch = pch.use, + axes = axes, + col.lab = col.lab, + col.main = col.lab, + ... 
+ ) + if (dark.theme) { + axis( + side = 1, + at = NULL, + labels = TRUE, + col.axis = col.lab, + col = col.lab + ) + axis( + side = 2, + at = NULL, + labels = TRUE, + col.axis = col.lab, + col = col.lab + ) + } + if (do.spline) { + # spline.fit <- smooth.spline(x = g1, y = g2, df = 4) + spline.fit <- smooth.spline(x = data.plot$x, y = data.plot$y, df = 4) + #lines(spline.fit$x,spline.fit$y,lwd=3) + #spline.fit=smooth.spline(g1,g2,df = 4) + # loess.fit <- loess(formula = g2 ~ g1, span=spline.span) + loess.fit <- loess(formula = y ~ x, data = data.plot, span = spline.span) + #lines(spline.fit$x,spline.fit$y,lwd=3) + # points(x = g1, y = loess.fit$fitted, col="darkblue") + points(x = data.plot$x, y = loess.fit$fitted, col = 'darkblue') + } + if (do.identify | do.hover) { + # This is where that untransposed renamed data.frame comes in handy + p <- ggplot2::ggplot(data = data.plot, mapping = aes(x = x, y = y)) + p <- p + geom_point( + mapping = aes(color = colors), + size = cex.use, + shape = pch.use, + color = col.use + ) + p <- p + labs(title = gene.cor, x = gene1, y = gene2) + if (do.hover) { + names(x = data.plot) <- c(gene1, gene2) + if (is.null(x = data.hover)) { + features.info <- NULL + } else { + features.info <- FetchData(object = object, vars.all = data.hover) + } + return(HoverLocator( + plot = p, + data.plot = data.plot, + features.info = features.info, + dark.theme = dark.theme, + title = gene.cor + )) + } else if (do.identify) { + return(FeatureLocator( + plot = p, + data.plot = data.plot, + dark.theme = dark.theme + )) + } + } +} + +#' Cell-cell scatter plot +#' +#' Creates a plot of scatter plot of genes across two single cells +#' +#' @param object Seurat object +#' @param cell1 Cell 1 name (can also be a number, representing the position in +#' object@@cell.names) +#' @param cell2 Cell 2 name (can also be a number, representing the position in +#' object@@cell.names) +#' @param gene.ids Genes to plot (default, all genes) +#' @param col.use Colors to 
use for the points
#' @param nrpoints.use Parameter for smoothScatter
#' @param pch.use Point symbol to use
#' @param cex.use Point size
#' @param do.hover Enable hovering over points to view information
#' @param do.identify Opens a locator session to identify clusters of cells.
#' points to reveal gene names (hit ESC to stop)
#' @param \dots Additional arguments to pass to smoothScatter
#'
#' @return No return value (plots a scatter plot)
#'
#' @export
#'
CellPlot <- function(
  object,
  cell1,
  cell2,
  gene.ids = NULL,
  col.use = "black",
  nrpoints.use = Inf,
  pch.use = 16,
  cex.use = 0.5,
  do.hover = FALSE,
  do.identify = FALSE,
  ...
) {
  # Default to every gene in the expression matrix
  gene.ids <- SetIfNull(x = gene.ids, default = rownames(x = object@data))
  # Transpose this data.frame so that the genes are in the row for
  # easy selecting with do.identify
  data.plot <- as.data.frame(
    x = t(
      x = FetchData(
        object = object,
        vars.all = gene.ids,
        cells.use = c(cell1, cell2)
      )
    )
  )
  # Set names for easy calling with ggplot
  names(x = data.plot) <- c('x', 'y')
  # Correlation between the two cells' expression; shown as the plot title
  gene.cor <- round(x = cor(x = data.plot$x, y = data.plot$y), digits = 2)
  # Base-graphics density scatter; extra args in ... are forwarded here
  smoothScatter(
    x = data.plot$x,
    y = data.plot$y,
    xlab = cell1,
    ylab = cell2,
    col = col.use,
    nrpoints = nrpoints.use,
    pch = pch.use,
    cex = cex.use,
    main = gene.cor
  )
  # Interactive modes need a ggplot version of the same scatter
  if (do.identify | do.hover) {
    # This is where that transposed, renamed data.frame comes in handy
    p <- ggplot2::ggplot(data = data.plot, mapping = aes(x = x, y = y))
    # NOTE(review): aes(color = colors) does not reference a column of
    # data.plot; the fixed `color = col.use` setting below overrides it
    p <- p + geom_point(
      mapping = aes(color = colors),
      size = cex.use,
      shape = pch.use,
      color = col.use
    )
    p <- p + labs(title = gene.cor, x = cell1, y = cell2)
    if (do.hover) {
      # Restore the real cell names so the hover tooltip is informative
      names(x = data.plot) <- c(cell1, cell2)
      return(HoverLocator(plot = p, data.plot = data.plot, title = gene.cor))
    } else if (do.identify) {
      return(FeatureLocator(plot = p, data.plot = data.plot, ...))
    }
  }
}

#' Dimensional reduction heatmap
#'
#' Draws a heatmap focusing on a principal
component. Both cells and genes are sorted by their +#' principal component scores. Allows for nice visualization of sources of heterogeneity in the dataset. +#' +#' @inheritParams DoHeatmap +#' @inheritParams PCTopGenes +#' @inheritParams VizPCA +#' @param cells.use A list of cells to plot. If numeric, just plots the top cells. +#' @param use.scale Default is TRUE: plot scaled data. If FALSE, plot raw data on the heatmap. +#' @param label.columns Whether to label the columns. Default is TRUE for 1 PC, FALSE for > 1 PC +#' +#' @return If do.return==TRUE, a matrix of scaled values which would be passed +#' to heatmap.2. Otherwise, no return value, only a graphical output +#' +#' @export +#' +DimHeatmap <- function( + object, + reduction.type = "pca", + dim.use = 1, + cells.use = NULL, + num.genes = 30, + use.full = FALSE, + disp.min = -2.5, + disp.max = 2.5, + do.return = FALSE, + col.use = pyCols, + use.scale = TRUE, + do.balanced = FALSE, + remove.key = FALSE, + label.columns = NULL, + ... +) { + num.row <- floor(x = length(x = dim.use) / 3.01) + 1 + orig_par <- par()$mfrow + par(mfrow = c(num.row, min(length(x = dim.use), 3))) + cells <- cells.use + plots <- c() + + if (is.null(x = label.columns)) { + label.columns <- ! 
(length(x = dim.use) > 1) + } + for (ndim in dim.use) { + if (is.numeric(x = (cells))) { + cells.use <- DimTopCells( + object = object, + dim.use = ndim, + reduction.type = reduction.type, + num.cells = cells, + do.balanced = do.balanced + ) + } else { + cells.use <- SetIfNull(x = cells, default = object@cell.names) + } + genes.use <- rev(x = DimTopGenes( + object = object, + dim.use = ndim, + reduction.type = reduction.type, + num.genes = num.genes, + use.full = use.full, + do.balanced = do.balanced + )) + dim.scores <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "cell.embeddings" + ) + dim.key <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "key" + ) + cells.ordered <- cells.use[order(dim.scores[cells.use, paste0(dim.key, ndim)])] + #determine assay type + data.use <- NULL + assays.use <- c("RNA", names(x = object@assay)) + if (! use.scale) { + slot.use="data" + } else { + slot.use <- "scale.data" + } + for (assay.check in assays.use) { + data.assay <- GetAssayData( + object = object, + assay.type = assay.check, + slot = slot.use + ) + genes.intersect <- intersect(x = genes.use, y = rownames(x = data.assay)) + new.data <- data.assay[genes.intersect, cells.ordered] + if (! is.matrix(x = new.data)) { + new.data <- as.matrix(x = new.data) + } + data.use <- rbind(data.use, new.data) + } + #data.use <- object@scale.data[genes.use, cells.ordered] + data.use <- MinMax(data = data.use, min = disp.min, max = disp.max) + #if (!(use.scale)) data.use <- as.matrix(object@data[genes.use, cells.ordered]) + vline.use <- NULL + hmTitle <- paste(dim.key, ndim) + if (remove.key || length(dim.use) > 1) { + hmFunction <- "heatmap2NoKey(data.use, Rowv = NA, Colv = NA, trace = \"none\", col = col.use, dimTitle = hmTitle, " + } else { + hmFunction <- "heatmap.2(data.use,Rowv=NA,Colv=NA,trace = \"none\",col=col.use, dimTitle = hmTitle, " + } + if (! 
label.columns) { + hmFunction <- paste0(hmFunction, "labCol='', ") + } + hmFunction <- paste0(hmFunction, "...)") + #print(hmFunction) + eval(expr = parse(text = hmFunction)) + } + if (do.return) { + return(data.use) + } + # reset graphics parameters + par(mfrow = orig_par) +} + +PlotDim <- function( + ndim, + object, + reduction.type, + use.scaled, + use.full, + cells.use, + num.genes, + group.by, + disp.min, + disp.max, + col.low, + col.mid, + col.high, + slim.col.label, + do.balanced, + remove.key, + cex.col, + cex.row, + group.label.loc, + group.label.rot, + group.cex, + group.spacing +) { + if (is.numeric(x = (cells.use))) { + cells.use <- DimTopCells( + object = object, + dim.use = ndim, + reduction.type = reduction.type, + num.cells = cells.use, + do.balanced = do.balanced + ) + } else { + cells.use <- SetIfNull(x = cells.use, default = object@cell.names) + } + genes.use <- rev(x = DimTopGenes( + object = object, + dim.use = ndim, + reduction.type = reduction.type, + num.genes = num.genes, + use.full = use.full, + do.balanced = do.balanced + )) + dim.scores <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "cell.embeddings" + ) + dim.key <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "key" + ) + cells.ordered <- cells.use[order(dim.scores[cells.use, paste0(dim.key, ndim)])] + if (! 
use.scaled) { + data.use <- as.matrix(object@data[genes.use, cells.ordered]) + } else { + data.use <- object@scale.data[genes.use, cells.ordered] + data.use <- MinMax(data = data.use, min = disp.min, max = disp.max) + } + return(DoHeatmap( + object = object, + data.use = data.use, + cells.use = cells.use, + genes.use = genes.use, + group.by = group.by, + disp.min = disp.min, + disp.max = disp.max, + col.low = col.low, + col.mid = col.mid, + col.high = col.high, + slim.col.label = slim.col.label, + remove.key = remove.key, + cex.col = cex.col, + cex.row = cex.row, + group.label.loc = group.label.loc, + group.label.rot = group.label.rot, + group.cex = group.cex, + group.spacing = group.spacing, + title = paste0(dim.key, ndim), + do.plot = FALSE + )) +} + +#' Principal component heatmap +#' +#' Draws a heatmap focusing on a principal component. Both cells and genes are sorted by their principal component scores. +#' Allows for nice visualization of sources of heterogeneity in the dataset. +#' +#' @inheritParams DoHeatmap +#' @inheritParams PCTopGenes +#' @inheritParams VizPCA +#' @param cells.use A list of cells to plot. If numeric, just plots the top cells. +#' @param use.scale Default is TRUE: plot scaled data. If FALSE, plot raw data on the heatmap. +#' @param label.columns Whether to label the columns. Default is TRUE for 1 PC, FALSE for > 1 PC +#' +#' @return If do.return==TRUE, a matrix of scaled values which would be passed +#' to heatmap.2. Otherwise, no return value, only a graphical output +#' +#' @export +#' +PCHeatmap <- function( + object, + pc.use = 1, + cells.use = NULL, + num.genes = 30, + use.full = FALSE, + disp.min = -2.5, + disp.max = 2.5, + do.return = FALSE, + col.use = pyCols, + use.scale = TRUE, + do.balanced = FALSE, + remove.key = FALSE, + label.columns = NULL, + ... 
+) { + return(DimHeatmap( + object, + reduction.type = "pca", + dim.use = pc.use, + cells.use = cells.use, + num.genes = num.genes, + use.full = use.full, + disp.min = disp.min, + disp.max = disp.max, + do.return = do.return, + col.use = col.use, + use.scale = use.scale, + do.balanced = do.balanced, + remove.key = remove.key, + label.columns = label.columns, + ... + )) +} + +#' Independent component heatmap +#' +#' Draws a heatmap focusing on a principal component. Both cells and genes are sorted by their +#' principal component scores. Allows for nice visualization of sources of heterogeneity +#' in the dataset."() +#' +#' @inheritParams DoHeatmap +#' @inheritParams ICTopGenes +#' @inheritParams VizICA +#' @param use.scale Default is TRUE: plot scaled data. If FALSE, plot raw data on the heatmap. +#' +#' @return If do.return==TRUE, a matrix of scaled values which would be passed +#' to heatmap.2. Otherwise, no return value, only a graphical output +#' +#' @export +#' +ICHeatmap <- function( + object, + ic.use = 1, + cells.use = NULL, + num.genes = 30, + disp.min = -2.5, + disp.max = 2.5, + do.return = FALSE, + col.use = pyCols, + use.scale = TRUE, + do.balanced = FALSE, + remove.key = FALSE, + label.columns = NULL, + ... +) { + return(DimHeatmap( + object = object, + reduction.type = "ica", + dim.use = ic.use, + cells.use = cells.use, + num.genes = num.genes, + disp.min = disp.min, + disp.max = disp.max, + do.return = do.return, + col.use = col.use, + use.scale = use.scale, + do.balanced = do.balanced, + remove.key = remove.key, + label.columns = label.columns, + ... + )) +} + + +#' Visualize Dimensional Reduction genes +#' +#' Visualize top genes associated with reduction components +#' +#' @param object Seurat object +#' @param reduction.type Reduction technique to visualize results for +#' @param dims.use Number of dimensions to display +#' @param num.genes Number of genes to display +#' @param use.full Use reduction values for full dataset (i.e. 
projected dimensional reduction values) +#' @param font.size Font size +#' @param nCol Number of columns to display +#' @param do.balanced Return an equal number of genes with + and - scores. If FALSE (default), returns +#' the top genes ranked by the scores absolute values +#' +#' @return Graphical, no return value +#' +#' @export +#' +VizDimReduction <- function( + object, + reduction.type = "pca", + dims.use = 1:5, + num.genes = 30, + use.full = FALSE, + font.size = 0.5, + nCol = NULL, + do.balanced = FALSE +) { + if (use.full) { + dim.scores <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "gene.loadings.full" + ) + } else { + dim.scores <- GetDimReduction( + object = object, + reduction.type = reduction.type, + slot = "gene.loadings" + ) + } + if (is.null(x = nCol)) { + if (length(x = dims.use) > 6) { + nCol <- 3 + } else if (length(x = dims.use) > 9) { + nCol <- 4 + } else { + nCol <- 2 + } + } + num.row <- floor(x = length(x = dims.use) / nCol - 1e-5) + 1 + par(mfrow = c(num.row, nCol)) + for (i in dims.use) { + subset.use <- dim.scores[DimTopGenes( + object = object, + dim.use = i, + reduction.type = reduction.type, + num.genes = num.genes, + use.full = use.full, + do.balanced = do.balanced + ), ] + plot( + x = subset.use[, i], + y = 1:nrow(x = subset.use), + pch = 16, + col = "blue", + xlab = paste0("PC", i), + yaxt="n", + ylab="" + ) + axis( + side = 2, + at = 1:nrow(x = subset.use), + labels = rownames(x = subset.use), + las = 1, + cex.axis = font.size + ) + } + ResetPar() +} + +#' Visualize PCA genes +#' +#' Visualize top genes associated with principal components +#' +#' @param object Seurat object +#' @param pcs.use Number of PCs to display +#' @param num.genes Number of genes to display +#' @param use.full Use full PCA (i.e. 
the projected PCA, by default FALSE)
#' @param font.size Font size
#' @param nCol Number of columns to display
#' @param do.balanced Return an equal number of genes with both + and - PC scores.
#' If FALSE (by default), returns the top genes ranked by the score's absolute values
#'
#' @return Graphical, no return value
#'
#' @export
#'
VizPCA <- function(
  object,
  pcs.use = 1:5,
  num.genes = 30,
  use.full = FALSE,
  font.size = 0.5,
  nCol = NULL,
  do.balanced = FALSE
) {
  # Thin wrapper around VizDimReduction for the PCA reduction.
  VizDimReduction(
    object = object,
    reduction.type = "pca",
    dims.use = pcs.use,
    num.genes = num.genes,
    use.full = use.full,
    font.size = font.size,
    nCol = nCol,
    do.balanced = do.balanced
  )
}

#' Visualize ICA genes
#'
#' Visualize top genes associated with independent components
#'
#' @param object Seurat object
#' @param ics.use Number of ICs to display
#' @param num.genes Number of genes to display
#' @param use.full Use full ICA (i.e. the projected ICA, by default FALSE)
#' @param font.size Font size
#' @param nCol Number of columns to display
#' @param do.balanced Return an equal number of genes with both + and - IC scores.
#' If FALSE (by default), returns the top genes ranked by the score's absolute values
#'
#' @return Graphical, no return value
#'
#' @export
#'
VizICA <- function(
  object,
  ics.use = 1:5,
  num.genes = 30,
  use.full = FALSE,
  font.size = 0.5,
  nCol = NULL,
  do.balanced = FALSE
) {
  # Thin wrapper around VizDimReduction for the ICA reduction.
  VizDimReduction(
    object = object,
    reduction.type = "ica",
    # Fix: was `dims.use = pcs.use`, an undefined name in this function's
    # scope (copy-paste from VizPCA); the ICA dims argument is `ics.use`.
    dims.use = ics.use,
    num.genes = num.genes,
    use.full = use.full,
    font.size = font.size,
    nCol = nCol,
    do.balanced = do.balanced
  )
}

#' Dimensional reduction plot
#'
#' Graphs the output of a dimensional reduction technique (PCA by default).
#' Cells are colored by their identity class.
#'
#' @param object Seurat object
#' @param reduction.use Which dimensionality reduction to use.
Default is +#' "pca", can also be "tsne", or "ica", assuming these are precomputed. +#' @param dim.1 Dimension for x-axis (default 1) +#' @param dim.2 Dimension for y-axis (default 2) +#' @param cells.use Vector of cells to plot (default is all cells) +#' @param pt.size Adjust point size for plotting +#' @param do.return Return a ggplot2 object (default : FALSE) +#' @param do.bare Do only minimal formatting (default : FALSE) +#' @param cols.use Vector of colors, each color corresponds to an identity +#' class. By default, ggplot assigns colors. +#' @param group.by Group (color) cells in different ways (for example, orig.ident) +#' @param pt.shape If NULL, all points are circles (default). You can specify any +#' cell attribute (that can be pulled with FetchData) allowing for both different colors and +#' different shapes on cells. +#' @param do.hover Enable hovering over points to view information +#' @param data.hover Data to add to the hover, pass a character vector of features to add. Defaults to cell name +#' @param do.identify Opens a locator session to identify clusters of cells. +#' @param do.label Whether to label the clusters +#' @param label.size Sets size of labels +#' @param no.legend Setting to TRUE will remove the legend +#' @param no.axes Setting to TRUE will remove the axes +#' @param dark.theme Use a dark theme for the plot +#' +#' @return If do.return==TRUE, returns a ggplot2 object. Otherwise, only +#' graphical output. +#' +#' @import SDMTools +#' @importFrom dplyr summarize group_by +#' +#' @export +#' +DimPlot <- function( + object, + reduction.use = "pca", + dim.1 = 1, + dim.2 = 2, + cells.use = NULL, + pt.size = 3, + do.return = FALSE, + do.bare = FALSE, + cols.use = NULL, + group.by = "ident", + pt.shape = NULL, + do.hover = FALSE, + data.hover=NULL, + do.identify = FALSE, + do.label = FALSE, + label.size = 1, + no.legend = FALSE, + no.axes = FALSE, + dark.theme = FALSE, + ... 
+) { + embeddings.use = GetDimReduction(object = object, reduction.type = reduction.use, slot = "cell.embeddings") + if (length(x = embeddings.use) == 0) { + stop(paste(reduction.use, "has not been run for this object yet.")) + } + cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data)) + dim.code <- GetDimReduction( + object = object, + reduction.type = reduction.use, + slot = "key" + ) + dim.codes <- paste0(dim.code, c(dim.1, dim.2)) + data.plot <- as.data.frame(x = embeddings.use) + # data.plot <- as.data.frame(GetDimReduction(object, reduction.type = reduction.use, slot = "")) + cells.use <- intersect(x = cells.use, y = rownames(x = data.plot)) + data.plot <- data.plot[cells.use, dim.codes] + ident.use <- as.factor(x = object@ident[cells.use]) + if (group.by != "ident") { + ident.use <- as.factor(x = FetchData( + object = object, + vars.all = group.by + )[cells.use, 1]) + } + data.plot$ident <- ident.use + data.plot$x <- data.plot[, dim.codes[1]] + data.plot$y <- data.plot[, dim.codes[2]] + data.plot$pt.size <- pt.size + p <- ggplot(data = data.plot, mapping = aes(x = x, y = y)) + + geom_point(mapping = aes(colour = factor(x = ident)), size = pt.size) + if (! is.null(x = pt.shape)) { + shape.val <- FetchData(object = object, vars.all = pt.shape)[cells.use, 1] + if (is.numeric(shape.val)) { + shape.val <- cut(x = shape.val, breaks = 5) + } + data.plot[, "pt.shape"] <- shape.val + p <- ggplot(data = data.plot, mapping = aes(x = x, y = y)) + + geom_point( + mapping = aes(colour = factor(x = ident), shape = factor(x = pt.shape)), + size = pt.size + ) + } + if (! 
is.null(x = cols.use)) {
    p <- p + scale_colour_manual(values = cols.use)
  }
  # Axis labels and fixed point size on the base plot
  p2 <- p +
    xlab(label = dim.codes[[1]]) +
    ylab(label = dim.codes[[2]]) +
    scale_size(range = c(pt.size, pt.size))
  # Fix: removed `no.legend.title +` from this chain — that name is not
  # defined anywhere in scope and would error at plot build time; the legend
  # title is blanked explicitly on the line after this chain anyway.
  p3 <- p2 +
    SetXAxisGG() +
    SetYAxisGG() +
    SetLegendPointsGG(x = 6) +
    SetLegendTextGG(x = 12) +
    theme_bw() +
    NoGrid()
  p3 <- p3 + theme(legend.title = element_blank())
  if (do.label) {
    # Label each identity class at the median position of its cells
    data.plot %>%
      dplyr::group_by(ident) %>%
      summarize(x = median(x = x), y = median(x = y)) -> centers
    p3 <- p3 +
      geom_point(data = centers, mapping = aes(x = x, y = y), size = 0, alpha = 0) +
      geom_text(data = centers, mapping = aes(label = ident), size = label.size)
  }
  if (dark.theme) {
    p <- p + DarkTheme()
    p3 <- p3 + DarkTheme()
  }
  if (no.legend) {
    p3 <- p3 + theme(legend.position = "none")
  }
  if (no.axes) {
    p3 <- p3 + theme(
      axis.line = element_blank(),
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      panel.background = element_blank(),
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.background = element_blank()
    )
  }
  # Interactive modes: hover tooltips or click-to-identify
  if (do.identify || do.hover) {
    if (do.bare) {
      plot.use <- p
    } else {
      plot.use <- p3
    }
    if (do.hover) {
      if (is.null(x = data.hover)) {
        features.info <- NULL
      } else {
        features.info <- FetchData(object = object, vars.all = data.hover)
      }
      return(HoverLocator(
        plot = plot.use,
        data.plot = data.plot,
        features.info = features.info,
        dark.theme = dark.theme
      ))
    } else if (do.identify) {
      return(FeatureLocator(
        plot = plot.use,
        data.plot = data.plot,
        dark.theme = dark.theme,
        ...
+ )) + } + } + if (do.return) { + if (do.bare) { + return(p) + } else { + return(p3) + } + } + if (do.bare) { + print(p) + } else { + print(p3) + } +} + +#' Plot PCA map +#' +#' Graphs the output of a PCA analysis +#' Cells are colored by their identity class. +#' +#' This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +#' arguments which can be passed in here. +#' +#' @param object Seurat object +#' @param \dots Additional parameters to DimPlot, for example, which dimensions to plot. +#' +#' @export +#' +PCAPlot <- function(object, ...) { + return(DimPlot(object = object, reduction.use = "pca", label.size = 6, ...)) +} + +#' Plot Diffusion map +#' +#' Graphs the output of a Diffusion map analysis +#' Cells are colored by their identity class. +#' +#' This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +#' arguments which can be passed in here. +#' +#' @param object Seurat object +#' @param \dots Additional parameters to DimPlot, for example, which dimensions to plot. +#' +#' @export +DMPlot <- function(object, ...) { + return(DimPlot(object = object, reduction.use = "dm", label.size = 6, ...)) +} + +#' Plot ICA map +#' +#' Graphs the output of a ICA analysis +#' Cells are colored by their identity class. +#' +#' This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +#' arguments which can be passed in here. +#' +#' @param object Seurat object +#' @param \dots Additional parameters to DimPlot, for example, which dimensions to plot. +#' +#' @export +#' +ICAPlot <- function(object, ...) { + return(DimPlot(object = object, reduction.use = "ica", ...)) +} + +#' Plot tSNE map +#' +#' Graphs the output of a tSNE analysis +#' Cells are colored by their identity class. +#' +#' This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +#' arguments which can be passed in here. +#' +#' @param object Seurat object +#' @param do.label FALSE by default. 
If TRUE, plots an alternate view where the center of each
#' cluster is labeled
#' @param pt.size Set the point size
#' @param label.size Set the size of the text labels
#' @param cells.use Vector of cell names to use in the plot.
#' @param colors.use Manually set the color palette to use for the points
#' @param \dots Additional parameters to DimPlot, for example, which dimensions to plot.
#'
#' @seealso DimPlot
#'
#' @export
#'
TSNEPlot <- function(
  object,
  do.label = FALSE,
  pt.size = 1,
  label.size = 4,
  cells.use = NULL,
  colors.use = NULL,
  ...
) {
  # Thin wrapper around DimPlot for the tSNE reduction.
  return(DimPlot(
    object = object,
    reduction.use = "tsne",
    cells.use = cells.use,
    pt.size = pt.size,
    do.label = do.label,
    label.size = label.size,
    cols.use = colors.use,
    ...
  ))
}

#' Quickly Pick Relevant Dimensions
#'
#' Plots the standard deviations (or approximate singular values if running PCAFast)
#' of the principal components for easy identification of an elbow in the graph.
#' This elbow often corresponds well with the significant dims and is much faster to run than
#' Jackstraw
#'
#'
#' @param object Seurat object
#' @param reduction.type Type of dimensional reduction to plot data for
#' @param dims.plot Number of dimensions to plot sd for
#' @param xlab X axis label
#' @param ylab Y axis label
#' @param title Plot title
#'
#' @return Returns ggplot object
#'
#' @export
#'
DimElbowPlot <- function(
  object,
  reduction.type = "pca",
  dims.plot = 20,
  xlab = "",
  ylab = "",
  title = ""
) {
  data.use <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "sdev"
  )
  if (length(x = data.use) == 0) {
    # Fix: the message previously interpolated `reduction.use`, which is not
    # an argument of this function (undefined name); use `reduction.type`.
    stop(paste("No standard deviation info stored for", reduction.type))
  }
  if (length(x = data.use) < dims.plot) {
    warning(paste(
      "The object only has information for",
      length(x = data.use),
      "PCs."
    ))
    dims.plot <- length(x = data.use)
  }
  # Keep only the requested dimensions; seq_len/seq_along avoid the 1:0 trap
  data.use <- data.use[seq_len(dims.plot)]
  dims <- seq_along(along.with = data.use)
  data.plot <- data.frame(dims, data.use)
  plot <- ggplot(data = data.plot, mapping = aes(x = dims, y = data.use)) +
    geom_point()
  if (reduction.type == "pca") {
    plot <- plot +
      labs(y = "Standard Deviation of PC", x = "PC", title = title)
  } else if (reduction.type == "ica") {
    plot <- plot +
      labs(y = "Standard Deviation of IC", x = "IC", title = title)
  } else {
    plot <- plot +
      labs(y = ylab, x = xlab, title = title)
  }
  return(plot)
}

#' Quickly Pick Relevant PCs
#'
#' Plots the standard deviations (or approximate singular values if running PCAFast)
#' of the principle components for easy identification of an elbow in the graph.
#' This elbow often corresponds well with the significant PCs and is much faster to run.
#'
#' @param object Seurat object
#' @param num.pc Number of PCs to plot
#'
#' @return Returns ggplot object
#'
#' @export
#'
PCElbowPlot <- function(object, num.pc = 20) {
  # Thin wrapper around DimElbowPlot for the PCA reduction.
  return(DimElbowPlot(
    object = object,
    reduction.type = "pca",
    dims.plot = num.pc
  ))
}

#' View variable genes
#'
#' @param object Seurat object
#' @param do.text Add text names of variable genes to plot (default is TRUE)
#' @param cex.use Point size
#' @param cex.text.use Text size
#' @param do.spike FALSE by default. If TRUE, color all genes starting with ^ERCC a different color
#' @param pch.use Pch value for points
#' @param col.use Color to use
#' @param spike.col.use if do.spike, color for spike-in genes
#' @param plot.both Plot both the scaled and non-scaled graphs.
+#' @param do.contour Draw contour lines calculated based on all genes +#' @param contour.lwd Contour line width +#' @param contour.col Contour line color +#' @param contour.lty Contour line type +#' @param x.low.cutoff Bottom cutoff on x-axis for identifying variable genes +#' @param x.high.cutoff Top cutoff on x-axis for identifying variable genes +#' @param y.cutoff Bottom cutoff on y-axis for identifying variable genes +#' @param y.high.cutoff Top cutoff on y-axis for identifying variable genes +#' +#' @export +#' +VariableGenePlot <- function( + object, + do.text = TRUE, + cex.use = 0.5, + cex.text.use = 0.5, + do.spike = FALSE, + pch.use = 16, + col.use = "black", + spike.col.use = "red", + plot.both = FALSE, + do.contour = TRUE, + contour.lwd = 3, + contour.col = "white", + contour.lty = 2, + x.low.cutoff = 0.1, + x.high.cutoff = 8, + y.cutoff = 1, + y.high.cutoff = Inf +) { + gene.mean <- object@hvg.info[, 1] + gene.dispersion <- object@hvg.info[, 2] + gene.dispersion.scaled <- object@hvg.info[, 3] + names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(x = object@data) + pass.cutoff <- names(x = gene.mean)[which( + x = ( + (gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff) + ) & + (gene.dispersion.scaled > y.cutoff) & + (gene.dispersion.scaled < y.high.cutoff) + )] + if (do.spike) { + spike.genes <- rownames(x = SubsetRow(data = object@data, code = "^ERCC")) + } + if (plot.both) { + par(mfrow = c(1, 2)) + smoothScatter( + x = gene.mean, + y = gene.dispersion, + pch = pch.use, + cex = cex.use, + col = col.use, + xlab = "Average expression", + ylab = "Dispersion", + nrpoints = Inf + ) + if (do.contour) { + data.kde <- kde2d(x = gene.mean, y = gene.dispersion) + contour( + x = data.kde, + add = TRUE, + lwd = contour.lwd, + col = contour.col, + lty = contour.lty + ) + } + if (do.spike) { + points( + x = gene.mean[spike.genes], + y = gene.dispersion[spike.genes], + pch = 16, + cex = cex.use, + col = 
spike.col.use
      )
    }
    if (do.text) {
      text(
        x = gene.mean[pass.cutoff],
        y = gene.dispersion[pass.cutoff],
        labels = pass.cutoff,
        cex = cex.text.use
      )
    }
  }
  # Second (or only) panel: scaled dispersion vs. mean expression
  smoothScatter(
    x = gene.mean,
    y = gene.dispersion.scaled,
    pch = pch.use,
    cex = cex.use,
    col = col.use,
    xlab = "Average expression",
    ylab = "Dispersion",
    nrpoints = Inf
  )
  if (do.contour) {
    data.kde <- kde2d(x = gene.mean, y = gene.dispersion.scaled)
    contour(
      x = data.kde,
      add = TRUE,
      lwd = contour.lwd,
      col = contour.col,
      lty = contour.lty
    )
  }
  if (do.spike) {
    # Fix: dropped `nrpoints = Inf` here — it is an argument of
    # smoothScatter(), not a graphical parameter of points().
    points(
      x = gene.mean[spike.genes],
      y = gene.dispersion.scaled[spike.genes],
      pch = 16,
      cex = cex.use,
      col = spike.col.use
    )
  }
  if (do.text) {
    text(
      x = gene.mean[pass.cutoff],
      y = gene.dispersion.scaled[pass.cutoff],
      labels = pass.cutoff,
      cex = cex.text.use
    )
  }
}

#' Highlight classification results
#'
#' This function is useful to view where proportionally the clusters returned from
#' classification map to the clusters present in the given object. Utilizes the FeaturePlot()
#' function to color clusters in object.
#'
#' @param object Seurat object on which the classifier was trained and
#' onto which the classification results will be highlighted
#' @param clusters vector of cluster ids (output of ClassifyCells)
#' @param ... additional parameters to pass to FeaturePlot()
#'
#' @return Returns a feature plot with clusters highlighted by proportion of cells
#' mapping to that cluster
#'
#' @export
#'
VizClassification <- function(object, clusters, ...) {
  # Fix: was `table(out)` — `out` is not defined anywhere in this function
  # (the original even carried a "# What is out?" note); the cluster ids
  # come in through the `clusters` argument.
  cluster.dist <- prop.table(x = table(clusters))
+ object@meta.data$Classification <- numeric(nrow(x = object@meta.data)) + for (cluster in 1:length(x = cluster.dist)) { + cells.to.highlight <- WhichCells(object, names(cluster.dist[cluster])) + if (length(x = cells.to.highlight) > 0) { + object@meta.data[cells.to.highlight, ]$Classification <- cluster.dist[cluster] + } + } + if (any(grepl(pattern = "cols.use", x = deparse(match.call())))) { + return(FeaturePlot(object, "Classification", ...)) + } + cols.use = c("#f6f6f6", "black") + return(FeaturePlot(object, "Classification", cols.use = cols.use, ...)) +} + +#' Plot phylogenetic tree +#' +#' Plots previously computed phylogenetic tree (from BuildClusterTree) +#' +#' @param object Seurat object +#' @param \dots Additional arguments for plotting the phylogeny +#' +#' @return Plots dendogram (must be precomputed using BuildClusterTree), returns no value +#' +#' @importFrom ape plot.phylo +#' @importFrom ape nodelabels +#' +#' @export +#' +PlotClusterTree <- function(object, ...) { + if (length(x = object@cluster.tree) == 0) { + stop("Phylogenetic tree does not exist, build using BuildClusterTree") + } + data.tree <- object@cluster.tree[[1]] + plot.phylo(x = data.tree, direction = "downwards", ...) + nodelabels() +} + +#' Color tSNE Plot Based on Split +#' +#' Returns a tSNE plot colored based on whether the cells fall in clusters +#' to the left or to the right of a node split in the cluster tree. +#' +#' @param object Seurat object +#' @param node Node in cluster tree on which to base the split +#' @param color1 Color for the left side of the split +#' @param color2 Color for the right side of the split +#' @param color3 Color for all other cells +#' @inheritDotParams TSNEPlot -object +#' @return Returns a tSNE plot +#' @export +ColorTSNESplit <- function( + object, + node, + color1 = "red", + color2 = "blue", + color3 = "gray", + ... 
) {
  tree <- object@cluster.tree[[1]]
  # Children of the requested node: the two branches of the split
  split <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
  all.children <- DFT(
    tree = tree,
    node = tree$edge[, 1][1],
    only.children = TRUE
  )
  left.group <- DFT(tree = tree, node = split[1], only.children = TRUE)
  right.group <- DFT(tree = tree, node = split[2], only.children = TRUE)
  # A split child may itself be a leaf; DFT returns NA in that case
  if (any(is.na(x = left.group))) {
    left.group <- split[1]
  }
  if (any(is.na(x = right.group))) {
    right.group <- split[2]
  }
  remaining.group <- setdiff(x = all.children, y = c(left.group, right.group))
  left.cells <- WhichCells(object = object, ident = left.group)
  right.cells <- WhichCells(object = object, ident = right.group)
  remaining.cells <- WhichCells(object = object, ident = remaining.group)
  object <- SetIdent(
    object = object,
    cells.use = left.cells,
    ident.use = "Left Split"
  )
  object <- SetIdent(
    object = object,
    cells.use = right.cells,
    ident.use = "Right Split"
  )
  object <- SetIdent(
    object = object,
    cells.use = remaining.cells,
    ident.use = "Not in Split"
  )
  # Colors map to ident factor levels in alphabetical order:
  # "Left Split", "Not in Split", "Right Split"
  colors.use <- c(color1, color3, color2)
  return(TSNEPlot(object = object, colors.use = colors.use, ...))
}

#' Plot k-means clusters
#'
#' @param object A Seurat object
#' @param cells.use Cells to include in the heatmap
#' @param genes.cluster Clusters to include in heatmap
#' @param max.genes Maximum number of genes to include in the heatmap
#' @param slim.col.label Instead of displaying every cell name on the heatmap,
#' display only the identity class name once for each group
#' @param remove.key Removes the color key from the plot
#' @param row.lines Color separations of clusters
#' @param ... Extra parameters to DoHeatmap
#'
#' @seealso \code{\link{DoHeatmap}}
#'
#' @export
#'
KMeansHeatmap <- function(
  object,
  cells.use = object@cell.names,
  genes.cluster = NULL,
  max.genes = 1e6,
  slim.col.label = TRUE,
  remove.key = TRUE,
  row.lines = TRUE,
  ...
+) { + genes.cluster <- SetIfNull( + x = genes.cluster, + default = unique(x = object@kmeans@gene.kmeans.obj$cluster) + ) + genes.use <- GenesInCluster( + object = object, + cluster.num = genes.cluster, + max.genes = max.genes + ) + cluster.lengths <- sapply( + X = genes.cluster, + FUN = function(x) { + return(length(x = GenesInCluster(object = object, cluster.num = x))) + } + ) + #print(cluster.lengths) + if (row.lines) { + rowsep.use <- cumsum(x = cluster.lengths) + } else { + rowsep.use <- NA + } + DoHeatmap( + object = object, + cells.use = cells.use, + genes.use = genes.use, + slim.col.label = slim.col.label, + remove.key = remove.key, + rowsep = rowsep.use, + ... + ) +} + +#' Node Heatmap +#' +#' Takes an object, a marker list (output of FindAllMarkers), and a node +#' and plots a heatmap where genes are ordered vertically by the splits present +#' in the object@@cluster.tree slot. +#' +#' @param object Seurat object. Must have the cluster.tree slot filled (use BuildClusterTree) +#' @param marker.list List of marker genes given from the FindAllMarkersNode function +#' @param node Node in the cluster tree from which to start the plot, defaults to highest node in marker list +#' @param max.genes Maximum number of genes to keep for each division +#' @param ... Additional parameters to pass to DoHeatmap +#' +#' @importFrom dplyr %>% group_by filter top_n select +#' +#' @return Plots heatmap. No return value. +#' +#' @export +#' +NodeHeatmap <- function(object, marker.list, node = NULL, max.genes = 10, ...) 
{ + tree <- object@cluster.tree[[1]] + node <- SetIfNull(x = node, default = min(marker.list$cluster)) + node.order <- c(node, DFT(tree = tree, node = node)) + marker.list$rank <- seq(1:nrow(x = marker.list)) + marker.list %>% group_by(cluster) %>% filter(avg_diff > 0) %>% + top_n(max.genes, -rank) %>% select(gene, cluster) -> pos.genes + marker.list %>% group_by(cluster) %>% filter(avg_diff < 0) %>% + top_n(max.genes, -rank) %>% select(gene, cluster) -> neg.genes + gene.list <- vector() + node.stack <- vector() + for (n in node.order) { + if (NodeHasChild(tree = tree, node = n)) { + gene.list <- c( + gene.list, + c( + subset(x = pos.genes, subset = cluster == n)$gene, + subset(x = neg.genes, subset = cluster == n)$gene + ) + ) + if (NodeHasOnlyChildren(tree = tree, node = n)) { + gene.list <- c( + gene.list, + subset(x = neg.genes, subset = cluster == node.stack[length(node.stack)])$gene + ) + node.stack <- node.stack[-length(x = node.stack)] + } + } + else { + gene.list <- c(gene.list, subset(x = pos.genes, subset = cluster == n)$gene) + node.stack <- append(x = node.stack, values = n) + } + } + #gene.list <- rev(unique(rev(gene.list))) + descendants <- GetDescendants(tree = tree, node = node) + children <- descendants[!descendants %in% tree$edge[, 1]] + all.children <- tree$edge[,2][!tree$edge[,2] %in% tree$edge[, 1]] + DoHeatmap( + object = object, + cells.use = WhichCells(object = object, ident = children), + genes.use = gene.list, + slim.col.label = TRUE, + remove.key = TRUE, + ... + ) +} diff --git a/R/plotting_internal.R b/R/plotting_internal.R new file mode 100644 index 000000000..196911558 --- /dev/null +++ b/R/plotting_internal.R @@ -0,0 +1,1233 @@ +# Create a scatterplot with data from a ggplot2 scatterplot +# +# @param plot.data The original ggplot2 scatterplot data +# This is taken from ggplot2::ggplot_build +# @param dark.theme Plot using a dark theme +# @param smooth Use a smooth scatterplot instead of a standard scatterplot +# @param ... 
Extra parameters passed to graphics::plot or graphics::smoothScatter
#
PlotBuild <- function(plot.data, dark.theme = FALSE, smooth = FALSE, ...) {
  # Functions are first-class objects in R, so pick the plotting function
  # (standard vs. smooth scatterplot) up front and call it once below.
  if (smooth) {
    myplot <- smoothScatter
  } else {
    myplot <- plot
  }
  if (dark.theme) {
    par(bg = 'black')
    axes <- FALSE
    col.lab <- 'white'
  } else {
    # Fix: was the character string 'TRUE'; plot.default expects a logical
    # for `axes`, and the string only worked via implicit coercion.
    axes <- TRUE
    col.lab <- 'black'
  }
  myplot(
    plot.data[, c(1, 2)],
    col = plot.data$color,
    pch = plot.data$pch,
    # Halve the ggplot2 point size, with a floor of 0.5
    cex = vapply(
      X = plot.data$cex,
      FUN = function(x) {
        return(max(x / 2, 0.5))
      },
      FUN.VALUE = numeric(1)
    ),
    axes = axes,
    col.lab = col.lab,
    col.main = col.lab,
    ...
  )
  if (dark.theme) {
    # Redraw both axes in white so they are visible on the black background
    axis(
      side = 1,
      at = NULL,
      labels = TRUE,
      col.axis = col.lab,
      col = col.lab
    )
    axis(
      side = 2,
      at = NULL,
      labels = TRUE,
      col.axis = col.lab,
      col = col.lab
    )
  }
}

# Convert a ggplot2 scatterplot to base R graphics
#
# @param plot A ggplot2 scatterplot
# @param do.plot Create the plot with base R graphics
# @param ... Extra parameters passed to PlotBuild
#
# @return A dataframe with the data that created the ggplot2 scatterplot
#
GGpointToBase <- function(plot, do.plot = TRUE, ...) {
  plot.build <- ggplot2::ggplot_build(plot = plot)
  build.data <- plot.build$data[[1]]
  plot.data <- build.data[, c('x', 'y', 'colour', 'shape', 'size')]
  names(x = plot.data) <- c(
    plot.build$plot$labels$x,
    plot.build$plot$labels$y,
    'color',
    'pch',
    'cex'
  )
  if (do.plot) {
    PlotBuild(plot.data = plot.data, ...)
  }
  return(plot.data)
}

# Locate points on a plot and return them
#
# @param plot A ggplot2 plot
# @param recolor Do we recolor the plot to highlight selected points?
# @param dark.theme Plot using a dark theme
# @param ...
Extra parameters to PlotBuild
#
# @return A dataframe of x and y coordinates for points selected
#
PointLocator <- function(plot, recolor = TRUE, dark.theme = FALSE, ...) {
  # Convert the ggplot object to a data.frame
  plot.data <- GGpointToBase(plot = plot, dark.theme = dark.theme, ...)
  npoints <- nrow(x = plot.data)
  # Fix: "vertecies" -> "vertices" in the user-facing instructions
  cat("Click around the cluster of points you wish to select\n")
  cat("ie. select the vertices of a shape around the cluster you\n")
  cat("are interested in. Press when finished (right click for R-terminal users)\n\n")
  polygon <- locator(n = npoints, type = 'l')
  polygon <- data.frame(polygon)
  # pnt.in.poly returns a data.frame of points
  points.all <- SDMTools::pnt.in.poly(
    pnts = plot.data[, c(1, 2)],
    poly.pnts = polygon
  )
  # Find the located points (pip == 1 means inside the polygon)
  points.located <- points.all[which(x = points.all$pip == 1), ]
  # If we're recoloring, do the recolor
  if (recolor) {
    if (dark.theme) {
      no <- 'white'
    } else {
      no <- 'black'
    }
    points.all$color <- ifelse(test = points.all$pip == 1, yes = 'red', no = no)
    plot.data$color <- points.all$color
    PlotBuild(plot.data = plot.data, dark.theme = dark.theme, ...)
+ } + return(points.located[, c(1, 2)]) +} + +# Plot a single feature +# +# @param data.use The data regarding the feature +# @param feature The feature to plot +# @param data.plot The data to be plotted +# @param pt.size Size of each point +# @param pch.use Shape of each point +# @param cols.use Colors to plot +# @param dim.codes Codes for the dimensions to plot in +# @param min.cutoff Minimum cutoff for data +# @param max.cutoff Maximum cutoff for data +# @param no.axes Remove axes from plot +# @param no.legend Remove legend from plot +# @param dark.theme Plot in dark theme +# +# @return A ggplot2 scatterplot +# +SingleFeaturePlot <- function( + data.use, + feature, + data.plot, + pt.size, + pch.use, + cols.use, + dim.codes, + min.cutoff, + max.cutoff, + no.axes, + no.legend, + dark.theme +) { + data.gene <- na.omit(object = data.frame(data.use[feature, ])) + # Check for quantiles + min.cutoff <- SetQuantile(cutoff = min.cutoff, data = data.gene) + max.cutoff <- SetQuantile(cutoff = max.cutoff, data = data.gene) + # Mask any values below the minimum and above the maximum values + data.gene <- sapply( + X = data.gene, + FUN = function(x) { + return(ifelse(test = x < min.cutoff, yes = min.cutoff, no = x)) + } + ) + data.gene <- sapply( + X = data.gene, + FUN = function(x) { + return(ifelse(test = x > max.cutoff, yes = max.cutoff, no = x)) + } + ) + data.plot$gene <- data.gene + # Stuff for break points + if (length(x = cols.use) == 1) { + brewer.gran <- brewer.pal.info[cols.use, ]$maxcolors + } else { + brewer.gran <- length(x = cols.use) + } + # Cut points + if (all(data.gene == 0)) { + data.cut <- 0 + } else { + data.cut <- as.numeric(x = as.factor(x = cut( + x = as.numeric(x = data.gene), + breaks = brewer.gran + ))) + } + data.plot$col <- as.factor(x = data.cut) + # Start plotting + p <- ggplot(data = data.plot, mapping = aes(x = x, y = y)) + if (brewer.gran != 2) { + if (length(x = cols.use) == 1) { + p <- p + geom_point( + mapping = aes(color = col), + size = 
pt.size, + shape = pch.use + ) + scale_color_brewer(palette = cols.use) + } else { + p <- p + geom_point( + mapping = aes(color = col), + size = pt.size, + shape = pch.use + ) + scale_color_manual(values = cols.use) + } + } else { + if (all(data.plot$gene == data.plot$gene[1])) { + warning(paste0("All cells have the same value of ", feature, ".")) + p <- p + geom_point(color = cols.use[1], size = pt.size, shape = pch.use) + } else { + p <- p + geom_point( + mapping = aes(color = gene), + size = pt.size, + shape = pch.use + ) + scale_color_gradientn( + colors = cols.use, + guide = guide_colorbar(title = feature) + ) + } + } + if (no.axes) { + p <- p + labs(title = feature, x ="", y="") + theme( + axis.line = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank() + ) + } else { + p <- p + labs(title = feature, x = dim.codes[1], y = dim.codes[2]) + } + if (no.legend) { + p <- p + theme(legend.position = 'none') + } + if (dark.theme) { + p <- p + DarkTheme() + } + return(p) +} + +# Blend two feature plots together +# +# @param data.use The data regarding the feature +# @param features.plot The features to plot +# @param data.plot The data to be plotted +# @param pt.size Size of each point +# @param pch.use Shape of each point +# @param cols.use Colors to plot +# @param dim.codes Codes for the dimensions to plot in +# @param min.cutoff Minimum cutoff for data +# @param max.cutoff Maximum cutoff for data +# @param no.axes Remove axes from plot +# @param no.legend Remove legend from plot +# @param dark.theme Plot in dark theme +# +# @return A blended ggplot2 scatterplot +# +BlendPlot <- function( + data.use, + features.plot, + data.plot, + pt.size, + pch.use, + cols.use, + dim.codes, + min.cutoff, + max.cutoff, + no.axes, + no.legend, + dark.theme +) { + num.cols <- length(x = cols.use) + # Create a vector of colors that weren't provided + 
cols.not.provided <- colors(distinct = TRUE) + cols.not.provided <- cols.not.provided[!(grepl( + pattern = paste(cols.use, collapse = '|'), + x = cols.not.provided, + ignore.case = TRUE + ))] + if (num.cols > 4) { + # If provided more than four colors, take only the first four + cols.use <- cols.use[c(1:4)] + } else if ((num.cols == 2) || (num.cols == 3)) { + # If two or three colors, use the last two as high values for blending + # and add to our vector of colors + blend <- BlendColors(cols.use[c(num.cols - 1, num.cols)]) + cols.use <- c(cols.use, blend) + if (num.cols == 2) { + # If two colors, provided, + # we still need a low color + cols.use <- c(sample(x = cols.not.provided, size = 1), cols.use) + } + } else if ((num.cols == 1)) { + # If only one color provided + if (cols.use %in% rownames(x = brewer.pal.info)) { + # Was it a palette from RColorBrewer? If so, create + # our colors based on the palette + palette <- brewer.pal(n = 3, name = cols.use) + cols.use <- c(palette, BlendColors(palette[c(2, 3)])) + } else { + # If not, randomly create our colors + cols.high <- sample(x = cols.not.provided, size = 2, replace = FALSE) + cols.use <- c(cols.use, cols.high, BlendColors(cols.high)) + } + } else if (num.cols <= 0) { + cols.use <- c('yellow','red', 'blue', BlendColors('red', 'blue')) + } + names(x = cols.use) <- c('low', 'high1', 'high2', 'highboth') + length.check <- vapply( + X = list(features.plot, min.cutoff, max.cutoff), + FUN = function(x) { + return(length(x = x) != 2) + }, + FUN.VALUE = logical(length = 1) + ) + if (any(length.check)) { + stop("An overlayed FeaturePlot only works with two features and requires two minimum and maximum cutoffs") + } + # Check for quantiles + min.cutoff <- c( + SetQuantile(cutoff = min.cutoff[1], data = data.gene[features.plot[1], ]), + SetQuantile(cutoff = min.cutoff[2], data = data.gene[features.plot[2], ]) + ) + max.cutoff <- c( + SetQuantile(cutoff = max.cutoff[1], data = data.gene[features.plot[1], ]), + 
SetQuantile(cutoff = max.cutoff[2], data = data.gene[features.plot[2], ]) + ) + data.gene <- na.omit(object = data.frame(data.use[features.plot, ])) + cell.names <- colnames(x = data.gene) + # Minimum and maximum masking + data.gene <- matrix( + data = vapply( + X = data.gene, + FUN = function(x) ifelse(test = x < min.cutoff, yes = min.cutoff, no = x), + FUN.VALUE = c(1, 1) + ), + nrow = 2 + ) + data.gene <- matrix( + data = vapply( + X = as.data.frame(x = data.gene), + FUN = function(x) ifelse(test = x > max.cutoff, yes = max.cutoff, no = x), + FUN.VALUE = c(1, 1) + ), + nrow = 2 + ) + data.gene <- as.data.frame(x = data.gene) + rownames(x = data.gene) <- features.plot + colnames(x = data.gene) <- cell.names + # Stuff for break points + if(all(data.gene ==0)) { + data.cut <- 0 + } else { + # Cut the expression of both features + cuts <- apply( + X = data.gene, + MARGIN = 1, + FUN = cut, + breaks = 2, + labels = FALSE + ) + cuts.dim <- dim(x = cuts) + if (cuts.dim[1] > cuts.dim[2]){ + cuts <- t(x = cuts) + } + # Apply colors dependent on if the cell expresses + # none, one, or both features + data.cut = apply( + X = cuts, + MARGIN = 2, + FUN = function(x) { + return(if ((x[1] == 1) && (x[2] == 2)) { # Expression in 2 + 'high2' + } else if ((x[1] == 2) && (x[2] == 1)) { # Expression in 1 + 'high1' + } else if ((x[1] == 2) && (x[2] == 2)) { # Expression in both + 'highboth' + } else { # Expression in neither + 'low' + }) + } + ) + data.cut <- as.factor(x = data.cut) + } + data.plot$colors <- data.cut + # Start plotting + legend.names <- c( + 'high1' = paste('High', features.plot[1]), + 'high2' = paste('High', features.plot[2]), + 'highboth' = 'High both' + ) + title <- paste0(features.plot, collapse = ' x ') + p <- ggplot(data = data.plot, mapping = aes(x = x, y = y)) + p <- p + geom_point( + mapping = aes(color = colors), + size = pt.size, + shape = pch.use + ) + p <- p + scale_color_manual( + values = cols.use, + limits = c('high1', 'high2', 'highboth'), + labels = 
legend.names, + guide = guide_legend(title = NULL, override.aes = list(size = 2)) + ) + # Deal with axes and legends + if (no.axes) { + p <- p + labs(title = title, x ="", y="") + theme( + axis.line = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank() + ) + } else { + p <- p + labs(title = title, x = dim.codes[1], y = dim.codes[2]) + } + if (no.legend){ + p <- p + theme(legend.position = 'none') + } + if (dark.theme) { + p <- p + DarkTheme() + } + return(p) +} + +# Blend two or more colors together +# +# @param ... Two or more colors to blend together +# These can be in a vector or standalone +# @param as.rgb Return in RGB form, otherwise return in hexadecimal form +# +# @return The blended color in RGB form (1 x 3 matrix) or hexadecimal form +# +BlendColors <- function(..., as.rgb = FALSE) { + # Assemble the arguments passed into a character vector + colors <- as.character(x = c(...)) + if (length(x = colors) < 2) { + stop("Please provide two or more colors to blend") + } + # Check for hexadecimal values for automatic alpha blending + alpha.value <- 255 + if (sum(sapply(X = colors, FUN = grepl, pattern = '^#')) != 0) { + hex <- colors[which(x = grepl(pattern = '^#', x = colors))] + hex.length <- sapply(X = hex, FUN = nchar) + # 9-character hexadecimal values specify alpha levels + if (9 %in% hex.length) { + hex.alpha <- hex[which(x = hex.length == 9)] + hex.vals <- sapply(X = hex.alpha, FUN = substr, start = 8, stop = 9) + dec.vals <- sapply(X = hex.vals, FUN = strtoi, base = 16) + dec.vals <- dec.vals / 255 # Convert to 0:1 scale for calculations + alpha.value <- dec.vals[1] + # Blend alpha levels, going top-down + for (val in dec.vals[-1]) { + alpha.value <- alpha.value + (val * (1 - alpha.value)) + } + alpha.value <- alpha.value * 255 # Convert back to 0:255 scale + } + } + # Convert to a 3 by `length(colors)` matrix of RGB values + 
rgb.vals <- sapply(X = colors, FUN = col2rgb) + if (nrow(x = rgb.vals) != 3) { + rgb.vals <- t(x = rgb.vals) + } + # Blend together using the additive method + # Basically, resulting colors are the mean of the component colors + blend <- apply( + X = rgb.vals, + MARGIN = 1, + FUN = mean + ) + # If we're returning RGB values, convert to matrix, just like col2rgb + # Otherwise, return as hexadecimal; can be used directly for plotting + if (as.rgb) { + result <- matrix( + data = blend, + nrow = 3, + dimnames = list(c('red', 'green', 'blue'), 'blend') + ) + } else { + result <- rgb( + matrix(data = blend, ncol = 3), + alpha = alpha.value, + maxColorValue = 255 + ) + } + return(result) +} + +# Find the quantile of a data +# +# Converts a quantile in character form to a number regarding some data +# String form for a quantile is represented as a number prefixed with 'q' +# For example, 10th quantile is 'q10' while 2nd quantile is 'q2' +# +# Will only take a quantile of non-zero data values +# +# @param cutoff The cutoff to turn into a quantile +# @param data The data to turn find the quantile of +# +# @return The numerical representation of the quantile +# +SetQuantile <- function(cutoff, data) { + if (grepl(pattern = '^q[0-9]{1,2}$', x = as.character(x = cutoff), perl = TRUE)) { + this.quantile <- as.numeric(x = sub( + pattern = 'q', + replacement = '', + x = as.character(x = cutoff) + )) / 100 + data <- unlist(x = data) + data <- data[data > 0] + cutoff <- quantile(x = data, probs = this.quantile) + } + return(as.numeric(x = cutoff)) +} + +# No Grid +# +# Remove the grid lines from a ggplot2 plot +# +# @param ... Extra parameters to be passed to theme() +# @import ggplot2 +# @return A ggplot2 theme object +# @seealso \code{\link{theme}} +# @import ggplot2 +# @export +# +NoGrid <- function(...) { + no.grid <- theme( + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + ... 
+ ) + return(no.grid) +} + +# Reset Par +# +# Reset the graphing space to +# mfrow = c(1, 1) +# +# @param ... Extra parameters for par +# +ResetPar <- function(...) { + par(mfrow = c(1, 1), ...) +} + +# Plot a single feature on a violin plot +# +# @param feature Feature to plot +# @param data Data to plot +# @param cell.ident Idents to use +# @param do.sort Sort identity classes (on the x-axis) by the average +# expression of the attribute being potted +# @param y.max Maximum Y value to plot +# @param size.x.use X axis title font size +# @param size.y.use Y axis title font size +# @param size.title.use Main title font size +# @param adjust.use Adjust parameter for geom_violin +# @param point.size.use Point size for geom_violin +# @param cols.use Colors to use for plotting +# @param gene.names +# @param y.log plot Y axis on log scale +# @param x.lab.rot Rotate x-axis labels +# @param y.lab.rot Rotate y-axis labels +# @param legend.position Position the legend for the plot +# @param remove.legend Remove the legend from the plot +# +# @return A ggplot-based violin plot +# +SingleVlnPlot <- function( + feature, + data, + cell.ident, + do.sort, + y.max, + size.x.use, + size.y.use, + size.title.use, + adjust.use, + point.size.use, + cols.use, + gene.names, + y.log, + x.lab.rot, + y.lab.rot, + legend.position, + remove.legend +) { + set.seed(seed = 42) + data$ident <- cell.ident + if (do.sort) { + data$ident <- factor( + x = data$ident, + levels = names(x = rev(x = sort(x = tapply( + X = data[, feature], + INDEX = data$ident, + FUN = mean + )))) + ) + } + if (y.log) { + noise <- rnorm(n = length(x = data[, feature])) / 200 + data[, feature] <- data[, feature] + 1 + } else { + noise <- rnorm(n = length(x = data[, feature])) / 100000 + } + data[, feature] <- data[, feature] + noise + y.max <- SetIfNull(x = y.max, default = max(data[, feature])) + plot <- ggplot( + data = data, + mapping = aes( + x = factor(x = ident), + y = eval(expr = parse(text = feature)) + ) + ) + + 
geom_violin( + scale = "width", + adjust = adjust.use, + trim = TRUE, + mapping = aes(fill = factor(x = ident)) + ) + + theme( + legend.position = legend.position, + axis.title.x = element_text( + face = "bold", + colour = "#990000", + size = size.x.use + ), + axis.title.y = element_text( + face = "bold", + colour = "#990000", + size = size.y.use + ) + ) + + guides(fill = guide_legend(title = NULL)) + + geom_jitter(height = 0, size = point.size.use) + + xlab("Cell Type") + + NoGrid() + + ggtitle(feature) + + theme(plot.title = element_text(size = size.title.use, face = "bold")) + if (y.log) { + plot <- plot + scale_y_log10() + } else { + plot <- plot + ylim(min(data[, feature]), y.max) + } + if (feature %in% gene.names) { + if (y.log) { + plot <- plot + ylab(label = "Log Expression level") + } else { + plot <- plot + ylab(label = "Expression level") + } + } else { + plot <- plot + ylab(label = "") + } + if (! is.null(x = cols.use)) { + plot <- plot + scale_fill_manual(values = cols.use) + } + if (x.lab.rot) { + plot <- plot + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + } + if (y.lab.rot) { + plot <- plot + theme(axis.text.x = element_text(angle = 90)) + } + if (remove.legend) { + plot <- plot + theme(legend.position = "none") + } + return(plot) +} + +#remove legend title +no.legend.title <- theme(legend.title = element_blank()) + +#set legend text +SetLegendTextGG <- function(x = 12, y = "bold") { + return(theme(legend.text = element_text(size = x, face = y))) +} + +#set legend point size +SetLegendPointsGG <- function(x = 6) { + return(guides(colour = guide_legend(override.aes = list(size = x)))) +} + +#set x axis features +SetXAxisGG <- function(x = 16, y = "#990000", z = "bold", x2 = 12) { + return(theme( + axis.title.x = element_text(face = z, colour = y, size = x), + axis.text.x = element_text(angle = 90, vjust = 0.5, size = x2) + )) +} + +#set y axis features +SetYAxisGG <- function(x = 16, y = "#990000", z = "bold", x2 = 12) { + 
return(theme( + axis.title.y = element_text(face = z, colour = y, size = x), + axis.text.y = element_text(angle = 90, vjust = 0.5, size = x2) + )) +} + +#heatmap.2, but does not draw a key. +#unclear if this is necessary, but valuable to have the function coded in for modifications +heatmap2NoKey <- function ( + x, + Rowv = TRUE, + Colv = if (symm) "Rowv" else TRUE, + distfun = dist, + hclustfun = hclust, + dendrogram = c("both", "row", "column", "none"), + symm = FALSE, + scale = c("none", "row", "column"), + na.rm = TRUE, + revC = identical(x = Colv, y = "Rowv"), + add.expr, + breaks, + symbreaks = min(x < 0, na.rm = TRUE) || scale != "none", + col = "heat.colors", + colsep, + rowsep, + sepcolor = "white", + sepwidth = c(0.05, 0.05), + cellnote, + notecex = 1, + notecol = "cyan", + na.color = par("bg"), + trace = c("column", "row", "both", "none"), + tracecol = "cyan", + hline = median(breaks), + vline = median(breaks), + linecol = tracecol, + margins = c(5, 5), + ColSideColors, + RowSideColors, + cexRow = 0.2 + 1 / log10(x = nr), + cexCol = 0.2 + 1 / log10(x = nc), + labRow = NULL, + labCol = NULL, + key = TRUE, + keysize = 1.5, + density.info = c("histogram", "density", "none"), + denscol = tracecol, + symkey = min(x < 0, na.rm = TRUE) || symbreaks, + densadj = 0.25, + main = NULL, + xlab = NULL, + ylab = NULL, + lmat = NULL, + lhei = NULL, + axRowCol="black", + lwid = NULL, + dimTitle = NULL, + ... +) { + scale01 <- function(x, low = min(x), high = max(x)) { + return((x - low) / (high - low)) + } + retval <- list() + scale <- if (symm && missing(x = scale)) { + "none" + } else { + match.arg(arg = scale) + } + dendrogram <- match.arg(arg = dendrogram) + trace <- match.arg(arg = trace) + density.info <- match.arg(density.info) + if (length(x = col) == 1 && is.character(x = col)) { + col <- get(col, mode = "function") + } + if (! 
missing(x = breaks) && (scale != "none")) { + warning( + "Using scale=\"row\" or scale=\"column\" when breaks are", + "specified can produce unpredictable results.", + "Please consider using only one or the other." + ) + } + if (is.null(x = Rowv) || is.na(x = Rowv)) { + Rowv <- FALSE + } + if (is.null(x = Colv) || is.na(x = Colv)) { + Colv <- FALSE + } else if (Colv == "Rowv" && !isTRUE(x = Rowv)) { + Colv <- FALSE + } + if (length(x = di <- dim(x = x)) != 2 || !is.numeric(x = x)) { + stop("`x' must be a numeric matrix") + } + nr <- di[1] + nc <- di[2] + if (nr <= 1 || nc <= 1) { + stop("`x' must have at least 2 rows and 2 columns") + } + if (! is.numeric(x = margins) || length(x = margins) != 2) { + stop("`margins' must be a numeric vector of length 2") + } + if (missing(x = cellnote)) { + cellnote <- matrix(data = "", ncol = ncol(x = x), nrow = nrow(x = x)) + } + if (! inherits(x = Rowv, what = "dendrogram")) { + if (((! isTRUE(x = Rowv)) || (is.null(x = Rowv))) && + (dendrogram %in% c("both", "row"))) { + if (is.logical(x = Colv) && (Colv)) { + dendrogram <- "column" + } else { + dedrogram <- "none" + } + } + } + if (! 
inherits(x = Colv, what = "dendrogram")) { + if (((!isTRUE(x = Colv)) || (is.null(x = Colv))) && + (dendrogram %in% c("both", "column"))) { + if (is.logical(x = Rowv) && (Rowv)) { + dendrogram <- "row" + } else { + dendrogram <- "none" + } + } + } + if (inherits(x = Rowv, what = "dendrogram")) { + ddr <- Rowv + rowInd <- order.dendrogram(x = ddr) + } else if (is.integer(x = Rowv)) { + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(object = hcr) + ddr <- reorder(x = ddr, X = Rowv) + rowInd <- order.dendrogram(x = ddr) + if (nr != length(x = rowInd)) { + stop("row dendrogram ordering gave index of wrong length") + } + } else if (isTRUE(x = Rowv)) { + Rowv <- rowMeans(x = x, na.rm = na.rm) + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(object = hcr) + ddr <- reorder(x = ddr, X = Rowv) + rowInd <- order.dendrogram(x = ddr) + if (nr != length(x = rowInd)) { + stop("row dendrogram ordering gave index of wrong length") + } + } else { + rowInd <- nr:1 + } + if (inherits(x = Colv, what = "dendrogram")) { + ddc <- Colv + colInd <- order.dendrogram(x = ddc) + } else if (identical(x = Colv, y = "Rowv")) { + if (nr != nc) { + stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") + } + if (exists(x = "ddr")) { + ddc <- ddr + colInd <- order.dendrogram(x = ddc) + } else { + colInd <- rowInd + } + } else if (is.integer(x = Colv)) { + hcc <- hclustfun(distfun(if (symm) { + x + } else { + t(x = x) + })) + ddc <- as.dendrogram(object = hcc) + ddc <- reorder(x = ddc, X = Colv) + colInd <- order.dendrogram(x = ddc) + if (nc != length(x = colInd)) { + stop("column dendrogram ordering gave index of wrong length") + } + } else if (isTRUE(x = Colv)) { + Colv <- colMeans(x = x, na.rm = na.rm) + hcc <- hclustfun(distfun(if (symm) { + x + } else { + t(x = x) + })) + ddc <- as.dendrogram(object = hcc) + ddc <- reorder(x = ddc, X = Colv) + colInd <- order.dendrogram(x = ddc) + if (nc != length(x = colInd)) { + stop("column dendrogram ordering gave index of wrong length") + } + } else { + 
colInd <- 1:nc + } + retval$rowInd <- rowInd + retval$colInd <- colInd + retval$call <- match.call() + x <- x[rowInd, colInd] + x.unscaled <- x + cellnote <- cellnote[rowInd, colInd] + if (is.null(x = labRow)) { + labRow <- if (is.null(rownames(x = x))) { + (1:nr)[rowInd] + } else { + rownames(x = x) + } + } else { + labRow <- labRow[rowInd] + } + if (is.null(x = labCol)) { + labCol <- if (is.null(x = colnames(x = x))) { + (1:nc)[colInd] + } else { + colnames(x = x) + } + } else { + labCol <- labCol[colInd] + } + if (scale == "row") { + retval$rowMeans <- rm <- rowMeans(x = x, na.rm = na.rm) + x <- sweep(x = x, MARGIN = 1, STATS = rm) + retval$rowSDs <- sx <- apply(X = x, MARGIN = 1, FUN = sd, na.rm = na.rm) + x <- sweep(x = x, MARGIN = 1, STATS = sx, FUN = "/") + } else if (scale == "column") { + retval$colMeans <- rm <- colMeans(x = x, na.rm = na.rm) + x <- sweep(x = x, MARGIN = 2, STATS = rm) + retval$colSDs <- sx <- apply(X = x, MARGIN = 2, FUN = sd, na.rm = na.rm) + x <- sweep(x = x, MARGIN = 2, STATS = sx, FUN = "/") + } + if (missing(x = breaks) || is.null(x = breaks) || length(x = breaks) <1) { + if (missing(x = col) || is.function(x = col)) { + breaks <- 16 + } else { + breaks <- length(x = col) + 1 + } + } + if (length(x = breaks) == 1) { + if (! 
symbreaks) { + breaks <- seq( + from = min(x, na.rm = na.rm), + to = max(x, na.rm = na.rm), + length = breaks + ) + } else { + extreme <- max(abs(x = x), na.rm = TRUE) + breaks <- seq(from = -extreme, to = extreme, length = breaks) + } + } + nbr <- length(x = breaks) + ncol <- length(x = breaks) - 1 + if (class(x = col) == "function") { + col <- col(x = ncol) + } + min.breaks <- min(breaks) + max.breaks <- max(breaks) + x[x < min.breaks] <- min.breaks + x[x > max.breaks] <- max.breaks + # if (missing(lhei) || is.null(lhei)) + # lhei <- c(keysize, 4) + # if (missing(lwid) || is.null(lwid)) + # lwid <- c(keysize, 4) + # if (missing(lmat) || is.null(lmat)) { + # lmat <- rbind(4:3, 2:1) + # if (!missing(ColSideColors)) { + # if (!is.character(ColSideColors) || length(ColSideColors) != + # nc) + # stop("'ColSideColors' must be a character vector of length ncol(x)") + # lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + + # 1) + # lhei <- c(lhei[1], 0.2, lhei[2]) + # } + # if (!missing(RowSideColors)) { + # if (!is.character(RowSideColors) || length(RowSideColors) != + # nr) + # stop("'RowSideColors' must be a character vector of length nrow(x)") + # lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - + # 1), 1), lmat[, 2] + 1) + # lwid <- c(lwid[1], 0.2, lwid[2]) + # } + # lmat[is.na(lmat)] <- 0 + # } + # if (length(lhei) != nrow(lmat)) + # stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) + # if (length(lwid) != ncol(lmat)) + # stop("lwid must have length = ncol(lmat) =", ncol(lmat)) + # op <- par(no.readonly = TRUE) + # on.exit(par(op)) + # layout(lmat, widths = lwid, heights = lhei, respect = FALSE) + + if (! missing(x = RowSideColors)) { + par(mar = c(margins[1], 0, 0, 0.5)) + image(x = rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) + } + if (! 
missing(x = ColSideColors)) { + par(mar = c(0.5, 0, 0, margins[2])) + image(x = cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) + } + oldMar <- par()$mar + if (labCol[1] == "") { + par(mar = c(margins[1]-3, margins[2]-2, margins[1]-3, margins[2])) + } else { + par(mar = c(margins[1], margins[2], margins[1], margins[2])) + } + x <- t(x = x) + cellnote <- t(x = cellnote) + if (revC) { + iy <- nr:1 + if (exists(x = "ddr")) { + ddr <- rev(x = ddr) + } + x <- x[, iy] + cellnote <- cellnote[, iy] + } else { + iy <- 1:nr + } + # add pc number as title if plotting pc heatmaps + if(is.null(x = dimTitle)) { + dimTitle <- "" + } + #print(dimTitle) + image( + x = 1:nc, + y = 1:nr, + z = x, + xlim = 0.5 + c(0, nc), + ylim = 0.5 + c(0, nr), + axes = FALSE, + xlab = "", + ylab = "", + main = dimTitle, + col = col, + breaks = breaks, + ... + ) + retval$carpet <- x + if (exists(x = "ddr")) { + retval$rowDendrogram <- ddr + } + if (exists(x = "ddc")) { + retval$colDendrogram <- ddc + } + retval$breaks <- breaks + retval$col <- col + if (any(is.na(x = x))) { + mmat <- ifelse(test = is.na(x = x), yes = 1, no = NA) + image( + x = 1:nc, + y = 1:nr, + z = mmat, + axes = FALSE, + xlab = "", + ylab = "", + main = pc_title, + col = na.color, + add = TRUE + ) + } + axis( + side = 1, + at = 1:nc, + labels = labCol, + las = 2, + line = -0.5, + tick = 0, + cex.axis = cexCol + ) + if (! is.null(x = xlab)) { + mtext(text = xlab, side = 1, line = margins[1] - 1.25) + } + axis( + side = 4, + at = iy, + labels = labRow, + las = 2, + line = -0.5, + tick = 0, + cex.axis = cexRow, + col = axRowCol + ) + if (! is.null(x = ylab)) { + mtext(text = ylab, side = 4, line = margins[2] - 1.25) + } + if (! missing(x = add.expr)) { + eval(expr = substitute(expr = add.expr)) + } + if (! 
missing(x = colsep)) { + for (csep in colsep) { + rect( + xleft = csep + 0.5, + ybottom = rep(x = 0, length(x = csep)), + xright = csep + 0.5 + sepwidth[1], + ytop = rep(x = ncol(x = x) + 1, csep), + lty = 1, + lwd = 1, + col = sepcolor, + border = sepcolor + ) + } + } + if (! missing(x = rowsep)) { + for (rsep in rowsep) { + rect( + xleft = 0, + ybottom = (ncol(x = x) + 1 - rsep) - 0.5, + xright = nrow(x = x) + 1, + ytop = (ncol(x) + 1 - rsep) - 0.5 - sepwidth[2], + lty = 1, + lwd = 1, + col = sepcolor, + border = sepcolor + ) + } + } + min.scale <- min(breaks) + max.scale <- max(breaks) + x.scaled <- scale01(x = t(x = x), low = min.scale, high = max.scale) + if (trace %in% c("both", "column")) { + retval$vline <- vline + vline.vals <- scale01(x = vline, low = min.scale, high = max.scale) + for (i in colInd) { + if (! is.null(x = vline)) { + abline( + v = i - 0.5 + vline.vals, + col = linecol, + lty = 2 + ) + } + xv <- rep(x = i, nrow(x = x.scaled)) + x.scaled[, i] - 0.5 + xv <- c(xv[1], xv) + yv <- 1:length(x = xv) - 0.5 + ##lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (trace %in% c("both", "row")) { + retval$hline <- hline + hline.vals <- scale01(x = hline, low = min.scale, high = max.scale) + for (i in rowInd) { + if (! is.null(x = hline)) { + abline(h = i + hline, col = linecol, lty = 2) + } + yv <- rep(x = i, ncol(x = x.scaled)) + x.scaled[i, ] - 0.5 + yv <- rev(x = c(yv[1], yv)) + xv <- length(x = yv):1 - 0.5 + ##lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") + } + } + if (! 
missing(x = cellnote)) { + text( + x = c(row(x = cellnote)), + y = c(col(x = cellnote)), + labels = c(cellnote), + col = notecol, + cex = notecex + ) + } + #par(mar = c(margins[1], 0, 0, 0)) + if (dendrogram %in% c("both", "row")) { + ##plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") + } + ##else plot.new() + #par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) + if (dendrogram %in% c("both", "column")) { + ##plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") + } + ##else plot.new() + key <- FALSE + if (! is.null(x = main)) + title(main = main, cex.main = 1.5 * op[["cex.main"]]) + if (key) { + par(mar = c(5, 4, 2, 1), cex = 0.75) + tmpbreaks <- breaks + if (symkey) { + max.raw <- max(abs(x = c(x, breaks)), na.rm = TRUE) + min.raw <- -max.raw + tmpbreaks[1] <- -max(abs(x = x), na.rm = TRUE) + tmpbreaks[length(x = tmpbreaks)] <- max(abs(x = x), na.rm = TRUE) + } else { + min.raw <- min(x, na.rm = TRUE) + max.raw <- max(x, na.rm = TRUE) + } + z <- seq(from = min.raw, to = max.raw, length = length(x = col)) + #image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, + # xaxt = "n", yaxt = "n") + par(usr = c(0, 1, 0, 1)) + lv <- pretty(x = breaks) + xv <- scale01(x = as.numeric(x = lv), low = min.raw, high = max.raw) + axis(side = 1, at = xv, labels = lv) + if (scale == "row") { + mtext(side = 1, "Row Z-Score", line = 2) + } + else if (scale == "column") { + mtext(side = 1, "Column Z-Score", line = 2) + } + else { + mtext(side = 1, "Value", line = 2) + } + if (density.info == "density") { + dens <- density(x = x, adjust = densadj, na.rm = TRUE) + omit <- dens$x < min(breaks) | dens$x > max(breaks) + dens$x <- dens$x[-omit] + dens$y <- dens$y[-omit] + dens$x <- scale01(x = dens$x, low = min.raw, high = max.raw) + lines( + x = dens$x, + y = dens$y / max(dens$y) * 0.95, + col = denscol, + lwd = 1 + ) + axis( + side = 2, + at = pretty(x = dens$y) / max(dens$y) * 0.95, + pretty(dens$y) + ) + title(main = "Color Key\nand Density Plot") + 
par(cex = 0.5) + mtext(side = 2, "Density", line = 2) + } else if (density.info == "histogram") { + h <- hist(x = x, plot = FALSE, breaks = breaks) + hx <- scale01(x = breaks, low = min.raw, high = max.raw) + hy <- c(h$counts, h$counts[length(h$counts)]) + lines( + x = hx, + y = hy / max(hy) * 0.95, + lwd = 1, + type = "s", + col = denscol + ) + axis( + side = 2, + at = pretty(x = hy) / max(hy) * 0.95, + pretty(x = hy) + ) + title(main = "Color Key\nand Histogram") + par(cex = 0.5) + mtext(side = 2, "Count", line = 2) + } else { + title(main = "Color Key") + } + } + ##else plot.new() + retval$colorTable <- data.frame( + low = retval$breaks[-length(x = retval$breaks)], + high = retval$breaks[-1], color = retval$col + ) + invisible(x = retval) + par(mar = oldMar) +} diff --git a/R/plotting_utilities.R b/R/plotting_utilities.R new file mode 100644 index 000000000..22757e16f --- /dev/null +++ b/R/plotting_utilities.R @@ -0,0 +1,223 @@ +#' Dark Theme +#' +#' Add a dark theme to ggplot objects +#' +#' @param ... Extra parameters to be passed to theme() +#' @import ggplot2 +#' @return A ggplot2 theme object +#' @seealso \code{\link{theme}} +#' @import ggplot2 +#' @export +#' +DarkTheme <- function(...) 
{ + # Some constants for easier changing in the future + black.background <- element_rect(fill = 'black') + black.background.no.border <- element_rect(fill = 'black', size = 0) + font.margin <- 4 + white.text <- element_text( + colour = 'white', + margin = margin( + t = font.margin, + r = font.margin, + b = font.margin, + l = font.margin + ) + ) + white.line <- element_line(colour = 'white', size = 1) + no.line <- element_line(size = 0) + # Create the dark theme + dark.theme <- theme( + # Set background colors + plot.background = black.background, + panel.background = black.background, + legend.background = black.background, + legend.box.background = black.background.no.border, + legend.key = black.background.no.border, + # Set text colors + plot.title = white.text, + plot.subtitle = white.text, + axis.title = white.text, + axis.text = white.text, + legend.title = white.text, + legend.text = white.text, + # Set line colors + axis.line.x = white.line, + axis.line.y = white.line, + panel.grid = no.line, + panel.grid.minor = no.line, + # Make this a complete theme and validate it + complete = TRUE, + validate = TRUE, + # Extra parameters + ... + ) + return(dark.theme) +} + +#' Feature Locator +#' +#' Select points on a scatterplot and get information about them +#' +#' @param plot A ggplot2 plot +#' @param data.plot The oridinal data that went into the ggplot2 plot +#' @param ... Extra parameters, such as dark.theme, recolor, or smooth for using a dark theme, +#' recoloring based on selected cells, or using a smooth scatterplot, respectively +#' +#' @return The names of the points selected +#' +#' @seealso \code{\link{locator}} +#' @seealso \code{\link{ggplot2::ggplot_build}} +#' @export +#' +FeatureLocator <- function(plot, data.plot, ...) { + points.located <- PointLocator(plot = plot, ...) 
+ # The rownames for points.located correspond to the row indecies + # of data.plot thanks to the way the ggplot object was made + selected <- data.plot[as.numeric(x = rownames(x = points.located)), ] + return(rownames(x = selected)) +} + +#' Hover Locator +#' +#' Get quick information from a scatterplot by hovering over points +#' +#' @param plot A ggplot2 plot +#' @param data.plot The oridinal data that went into the ggplot2 plot +#' @param features.info An optional dataframe or matrix of extra information to be displayed on hover +#' @param dark.theme Plot using a dark theme? +#' @param ... Extra parameters to be passed to plotly::layout +#' +#' @seealso \code{\link{plotly::layout}} +#' @seealso \code{\link{ggplot2::ggplot_build}} +#' @export +#' +HoverLocator <- function( + plot, + data.plot, + features.info = NULL, + dark.theme = FALSE, + ... +) { + # Use GGpointToBase because we already have ggplot objects + # with colors (which are annoying in plotly) + plot.build <- GGpointToBase(plot = plot, do.plot = FALSE) + rownames(x = plot.build) <- rownames(data.plot) + # Reset the names to 'x' and 'y' + names(x = plot.build) <- c( + 'x', + 'y', + names(x = plot.build)[3:length(x = plot.build)] + ) + # Add the names we're looking for (eg. cell name, gene name) + if (is.null(x = features.info)) { + plot.build$feature <- rownames(x = data.plot) + } else { + info <- apply( + X = features.info, + MARGIN = 1, + FUN = paste, + collapse = '
' + ) + data.info <- data.frame( + feature = paste(rownames(x = features.info), info, sep = '
'), + row.names = rownames(x = features.info) + ) + plot.build <- merge(x = plot.build, y = data.info, by = 0) + } + # Set up axis labels here + # Also, a bunch of stuff to get axis lines done properly + xaxis <- list( + title = names(x = data.plot)[1], + showgrid = FALSE, + zeroline = FALSE, + showline = TRUE + ) + yaxis <- list( + title = names(x = data.plot)[2], + showgrid = FALSE, + zeroline = FALSE, + showline = TRUE + ) + # Check for dark theme + if (dark.theme) { + title <- list(color = 'white') + xaxis <- c(xaxis, color = 'white') + yaxis <- c(yaxis, color = 'white') + plotbg <- 'black' + } else { + title = list(color = 'black') + plotbg = 'white' + } + # Start plotly and pipe it into layout for axis modifications + # The `~' means pull from the data passed (this is why we reset the names) + # Use I() to get plotly to accept the colors from the data as is + # Set hoverinfo to 'text' to override the default hover information + # rather than append to it + plotly::plot_ly( + data = plot.build, + x = ~x, + y = ~y, + type = 'scatter', + mode = 'markers', + color = ~I(color), + hoverinfo = 'text', + text = ~feature + ) %>% plotly::layout( + xaxis = xaxis, + yaxis = yaxis, + titlefont = title, + paper_bgcolor = plotbg, + plot_bgcolor = plotbg, + ... + ) +} + +#' Create a custom color palette +#' +#' Creates a custom color palette based on low, middle, and high color values +#' +#' @param low low color +#' @param high high color +#' @param mid middle color. Optional. 
+#' @param k number of steps (colors levels) to include between low and high values +#' +#' @export +#' +CustomPalette <- function( + low = "white", + high = "red", + mid = NULL, + k = 50 +) { + low <- col2rgb(col = low) / 255 + high <- col2rgb(col = high) / 255 + if (is.null(x = mid)) { + r <- seq(from = low[1], to = high[1], len = k) + g <- seq(from = low[2], to = high[2], len = k) + b <- seq(from = low[3], to = high[3], len = k) + } else { + k2 <- round(x = k / 2) + mid <- col2rgb(col = mid) / 255 + r <- c( + seq(from = low[1], to = mid[1], len = k2), + seq(from = mid[1], to = high[1], len = k2) + ) + g <- c( + seq(from = low[2], to = mid[2], len = k2), + seq(from = mid[2], to = high[2],len = k2) + ) + b <- c( + seq(from = low[3], to = mid[3], len = k2), + seq(from = mid[3], to = high[3], len = k2) + ) + } + return(rgb(red = r, green = g, blue = b)) +} + +#shortcut to make black-white palette +#' @export +bwCols <- CustomPalette(low = "white", high="black", k = 50) + +#' @export +#shortcut to make purple-yellow palette, which is default in most Seurat heatmaps +pyCols <- CustomPalette(low = "magenta", high = "yellow", mid = "black") diff --git a/R/preprocessing.R b/R/preprocessing.R new file mode 100644 index 000000000..ebf3eb14f --- /dev/null +++ b/R/preprocessing.R @@ -0,0 +1,733 @@ +#' Initialize and setup the Seurat object +#' +#' Initializes the Seurat object and some optional filtering +#' @param raw.data Raw input data +#' @param project Project name (string) +#' @param min.cells Include genes with detected expression in at least this +#' many cells. Will subset the raw.data matrix as well. To reintroduce excluded +#' genes, create a new object with a lower cutoff. +#' @param min.genes Include cells where at least this many genes are detected. +#' @param is.expr Expression threshold for 'detected' gene. For most datasets, particularly UMI +#' datasets, will be set to 0 (default). 
#' If not, when initializing, this should be set to a level
#' based on pre-normalized counts (i.e. require at least 5 counts to be treated as expressed). All
#' values less than this will be set to 0 (though maintained in object@@raw.data).
#' @param normalization.method Method for cell normalization. Default is no normalization.
#' In this case, run NormalizeData later in the workflow. As a shortcut, you can specify a
#' normalization method (i.e. LogNormalize) here directly.
#' @param scale.factor If normalizing on the cell level, this sets the scale factor.
#' @param do.scale In object@@scale.data, perform row-scaling (gene-based
#' z-score). FALSE by default. In this case, run ScaleData later in the workflow. As a shortcut, you
#' can specify do.scale=T (and do.center=T) here.
#' @param do.center In object@@scale.data, perform row-centering (gene-based centering)
#' @param names.field For the initial identity class for each cell, choose this field from the
#' cell's column name
#' @param names.delim For the initial identity class for each cell, choose this delimiter from the
#' cell's column name
#' @param meta.data Additional metadata to add to the Seurat object. Should be a data frame where
#' the rows are cell names, and the columns are additional metadata fields
#' @param save.raw TRUE by default. If FALSE, do not save the unmodified data in object@@raw.data
#' which will save memory downstream for large datasets
#'
#' @return Returns a Seurat object with the raw data stored in object@@raw.data.
#' object@@data, object@@meta.data, object@@ident, also initialized.
#'
#' @import stringr
#' @import pbapply
#' @importFrom Matrix colSums rowSums
#'
#' @export
#'
CreateSeuratObject <- function(
  raw.data,
  project = "SeuratProject",
  min.cells = 0,
  min.genes = 0,
  is.expr = 0,
  normalization.method = NULL,
  scale.factor = 1e4,
  do.scale = FALSE,
  do.center = FALSE,
  names.field = 1,
  names.delim = "_",
  meta.data = NULL,
  save.raw = TRUE
) {
  seurat.version <- packageVersion("Seurat")
  object <- new(
    Class = "seurat",
    raw.data = raw.data,
    is.expr = is.expr,
    project.name = project,
    version = seurat.version
  )
  # Filter cells on number of genes detected; modifies the raw.data slot as well
  object.raw.data <- object@raw.data
  if (is.expr > 0) {
    # Values below the expression threshold are treated as undetected
    object.raw.data[object.raw.data < is.expr] <- 0
  }
  num.genes <- colSums(object.raw.data > is.expr)
  num.mol <- colSums(object.raw.data)
  # NOTE(review): strictly greater than min.genes, despite "at least" in the docs
  cells.use <- names(num.genes[which(num.genes > min.genes)])
  object@raw.data <- object@raw.data[, cells.use]
  object@data <- object.raw.data[, cells.use]
  # To save memory downstream, especially for large objects, drop raw.data
  # if it is no longer needed
  if (!save.raw) {
    object@raw.data <- matrix()
  }
  # Filter genes on the number of cells expressing; modifies raw.data as well
  genes.use <- rownames(object@data)
  if (min.cells > 0) {
    num.cells <- rowSums(object@data > 0)
    genes.use <- names(num.cells[which(num.cells >= min.cells)])
    object@raw.data <- object@raw.data[genes.use, ]
    object@data <- object@data[genes.use, ]
  }
  # Initial identity classes are parsed from the cell (column) names
  object@ident <- factor(
    x = unlist(
      x = lapply(
        X = colnames(x = object@data),
        FUN = ExtractField,
        field = names.field,
        delim = names.delim
      )
    )
  )
  names(x = object@ident) <- colnames(x = object@data)
  object@cell.names <- names(x = object@ident)
  # If the parsed idents are uninformative (too many levels, none, or one per
  # cell), collapse all idents to the project name
  ident.levels <- length(x = unique(x = object@ident))
  if ((ident.levels > 100 || ident.levels == 0) || ident.levels == length(x = object@ident)) {
    object <- SetIdent(object, ident.use = project)
  }
  nGene <- num.genes[cells.use]
  nUMI <- num.mol[cells.use]
  object@meta.data <- data.frame(nGene, nUMI)
  if (!is.null(x = meta.data)) {
    object <- AddMetaData(object = object, metadata = meta.data)
  }
  object@meta.data[names(object@ident), "orig.ident"] <- object@ident
  # Optional shortcuts: normalize and/or scale at object creation time
  if (!is.null(normalization.method)) {
    object <- NormalizeData(
      object = object,
      assay.type = "RNA",
      normalization.method = normalization.method,
      scale.factor = scale.factor
    )
  }
  if (do.scale || do.center) {
    object <- ScaleData(
      object = object,
      do.scale = do.scale,
      do.center = do.center
    )
  }
  spatial.obj <- new(
    Class = "spatial.info",
    mix.probs = data.frame(nGene)
  )
  object@spatial <- spatial.obj
  # Record call parameters (minus the large data arguments) for provenance
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("CreateSeuratObject"))]
  parameters.to.store$raw.data <- NULL
  parameters.to.store$meta.data <- NULL
  object <- SetCalcParams(
    object = object,
    calculation = "CreateSeuratObject",
    ... = parameters.to.store
  )
  return(object)
}

#' Load in data from 10X
#'
#' Enables easy loading of sparse data matrices provided by 10X genomics.
#'
#' @param data.dir Directory containing the matrix.mtx, genes.tsv, and barcodes.tsv
#' files provided by 10X. A vector or named vector can be given in order to load
#' several data directories. If a named vector is given, the cell barcode names
#' will be prefixed with the name.
#'
#' @return Returns a sparse matrix with rows and columns labeled
#'
#' @importFrom Matrix readMM
#'
#' @export
#'
Read10X <- function(data.dir = NULL) {
  full.data <- list()
  for (i in seq_along(data.dir)) {
    run <- data.dir[i]
    if (!dir.exists(run)) {
      stop("Directory provided does not exist")
    }
    # Normalize to a trailing slash so file names can be appended directly
    if (!grepl("\\/$", run)) {
      run <- paste0(run, "/")
    }
    barcode.loc <- paste0(run, "barcodes.tsv")
    gene.loc <- paste0(run, "genes.tsv")
    matrix.loc <- paste0(run, "matrix.mtx")
    if (!file.exists(barcode.loc)) {
      stop("Barcode file missing")
    }
    if (!file.exists(gene.loc)) {
      stop("Gene name file missing")
    }
    if (!file.exists(matrix.loc)) {
      stop("Expression matrix file missing")
    }
    data <- readMM(file = matrix.loc)
    cell.names <- readLines(barcode.loc)
    gene.names <- readLines(gene.loc)
    # Strip the trailing '-1' GEM-group suffix when every barcode carries it
    if (all(grepl(pattern = "\\-1$", x = cell.names))) {
      cell.names <- as.vector(
        x = as.character(
          x = sapply(
            X = cell.names,
            FUN = ExtractField,
            field = 1,
            delim = "-"
          )
        )
      )
    }
    # The second tab-delimited field of genes.tsv holds the gene symbol
    rownames(x = data) <- make.unique(
      names = as.character(
        x = sapply(
          X = gene.names,
          FUN = ExtractField,
          field = 2,
          delim = "\\t"
        )
      )
    )
    # Prefix cell names to keep barcodes unique across multiple runs
    if (is.null(x = names(x = data.dir))) {
      if (i < 2) {
        colnames(x = data) <- cell.names
      } else {
        colnames(x = data) <- paste0(i, "_", cell.names)
      }
    } else {
      colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
    }
    full.data <- append(x = full.data, values = data)
  }
  full.data <- do.call(cbind, full.data)
  return(full.data)
}

#' Normalize Assay Data
#'
#' Normalize data for a given assay
#'
#' @param object Seurat object
#' @param assay.type Type of assay to normalize for (default is RNA), but can be
#' changed for multimodal analyses.
#' @param normalization.method Method for normalization. Default is
#' log-normalization (LogNormalize). More methods to be added very shortly.
#' @param scale.factor Sets the scale factor for cell-level normalization
#' @param display.progress display progress bar for scaling procedure.
#'
#' @return Returns object after normalization.
#' Normalized data is stored in data
#' or scale.data slot, depending on the method
#'
#' @export
#'
NormalizeData <- function(
  object,
  assay.type = "RNA",
  normalization.method = "LogNormalize",
  scale.factor = 1e4,
  display.progress = TRUE
) {
  # Record the call parameters for provenance before doing any work
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("NormalizeData"))]
  object <- SetCalcParams(
    object = object,
    calculation = "NormalizeData",
    ... = parameters.to.store
  )
  # A NULL method is an explicit no-op
  if (is.null(normalization.method)) {
    return(object)
  }
  if (normalization.method == "LogNormalize") {
    count.data <- GetAssayData(
      object = object,
      assay.type = assay.type,
      slot = "raw.data"
    )
    if (is.null(x = count.data)) {
      stop(paste("Raw data for", assay.type, "has not been set"))
    }
    # Normalize the raw counts and store the result in the 'data' slot
    object <- SetAssayData(
      object = object,
      assay.type = assay.type,
      slot = "data",
      new.data = LogNormalize(
        data = count.data,
        scale.factor = scale.factor,
        display.progress = display.progress
      )
    )
  }
  return(object)
}

#' Old R based implementation of ScaleData. Scales and centers the data
#'
#' @param object Seurat object
#' @param genes.use Vector of gene names to scale/center. Default is all genes in object@@data.
#' @param data.use Can optionally pass a matrix of data to scale, default is object@@data[genes.use,]
#' @param do.scale Whether to scale the data.
#' @param do.center Whether to center the data.
#' @param scale.max Max value to accept for scaled data. The default is 10. Setting this can help
#' reduce the effects of genes that are only expressed in a very small number of cells.
#'
#' @return Returns a seurat object with object@@scale.data updated with scaled and/or centered data.
#'
#' @export
#'
ScaleDataR <- function(
  object,
  genes.use = NULL,
  data.use = NULL,
  do.scale = TRUE,
  do.center = TRUE,
  scale.max = 10
) {
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  genes.use <- intersect(x = genes.use, y = rownames(x = object@data))
  data.use <- SetIfNull(x = data.use, default = object@data[genes.use, ])
  object@scale.data <- matrix(
    data = NA,
    nrow = length(x = genes.use),
    ncol = ncol(x = object@data)
  )
  dimnames(x = object@scale.data) <- dimnames(x = data.use)
  if (do.scale || do.center) {
    # Scale in bins of 1000 genes to bound peak memory use
    bin.size <- 1000
    max.bin <- floor(length(genes.use) / bin.size) + 1
    print("Scaling data matrix")
    pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
    for (i in 1:max.bin) {
      my.inds <- ((bin.size * (i - 1)):(bin.size * i - 1)) + 1
      my.inds <- my.inds[my.inds <= length(x = genes.use)]
      new.data <- t(
        x = scale(
          x = t(x = as.matrix(x = data.use[genes.use[my.inds], ])),
          center = do.center,
          scale = do.scale
        )
      )
      # Clip extreme values so rare genes do not dominate downstream analyses
      new.data[new.data > scale.max] <- scale.max
      object@scale.data[genes.use[my.inds], ] <- new.data
      setTxtProgressBar(pb, i)
    }
    close(pb)
  }
  return(object)
}


#' Scale and center the data.
#'
#' Scales and centers the data. If latent variables are provided (latent.vars), their effects are
#' removed through regression and the resulting residuals are then scaled and centered.
#'
#'
#' @param object Seurat object
#' @param genes.use Vector of gene names to scale/center. Default is all genes
#' in object@@data.
#' @param data.use Can optionally pass a matrix of data to scale, default is
#' object@@data[genes.use, ]
#' @param latent.vars effects to regress out
#' @param model.use Use a linear model or generalized linear model
#' (poisson, negative binomial) for the regression. Options are 'linear'
#' (default), 'poisson', and 'negbinom'
#' @param use.umi Regress on UMI count data. Default is FALSE for linear
#' modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'
#' @param do.scale Whether to scale the data.
#' @param do.center Whether to center the data.
#' @param scale.max Max value to return for scaled data. The default is 10.
#' Setting this can help reduce the effects of genes that are only expressed in
#' a very small number of cells.
#' @param block.size Default size for number of genes to scale at in a single
#' computation. Increasing block.size may speed up calculations but at an
#' additional memory cost.
#' @param min.cells.to.block If object contains fewer than this number of cells,
#' don't block for scaling calculations.
#' @param display.progress Displays a progress bar for scaling procedure
#' @param assay.type Assay to scale data for. Default is RNA. Can be changed for
#' multimodal analyses.
#' @param do.cpp By default (TRUE), most of the heavy lifting is done in c++.
#' We've maintained support for our previous implementation in R for
#' reproducibility (set this to FALSE) as results can change slightly due to
#' differences in numerical precision which could affect downstream calculations.
#'
#' @return Returns a seurat object with object@@scale.data updated with scaled
#' and/or centered data.
#'
#' @export
#'
ScaleData <- function(
  object,
  genes.use = NULL,
  data.use = NULL,
  latent.vars,
  model.use = 'linear',
  use.umi = FALSE,
  do.scale = TRUE,
  do.center = TRUE,
  scale.max = 10,
  block.size = 1000,
  min.cells.to.block = 3000,
  display.progress = TRUE,
  assay.type = "RNA",
  do.cpp = TRUE
) {
  data.use <- SetIfNull(
    x = data.use,
    default = GetAssayData(
      object = object,
      assay.type = assay.type,
      slot = "data"
    )
  )
  if (!("NormalizeData" %in% names(object@calc.params))) {
    cat("NormalizeData has not been run, therefore ScaleData is running on non-normalized values. Recommended workflow is to run NormalizeData first.\n")
  }
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.use))
  genes.use <- as.vector(
    x = intersect(
      x = genes.use,
      y = rownames(x = data.use)
    )
  )
  data.use <- data.use[genes.use, ]
  # Record call parameters (minus the data matrix) for provenance
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("ScaleData"))]
  parameters.to.store$data.use <- NULL
  object <- SetCalcParams(
    object = object,
    calculation = "ScaleData",
    ... = parameters.to.store
  )
  # Optionally regress out latent variables first; scaling then runs on residuals
  if (!missing(latent.vars)) {
    data.use <- RegressOut(
      object = object,
      latent.vars = latent.vars,
      genes.regress = genes.use,
      use.umi = use.umi,
      model.use = model.use
    )
  }
  # Fall back to the legacy R implementation on request
  if (!do.cpp) {
    return(ScaleDataR(
      object = object,
      data.use = data.use,
      do.scale = do.scale,
      do.center = do.center,
      scale.max = scale.max,
      genes.use = genes.use
    ))
  }
  scaled.data <- matrix(
    data = NA,
    nrow = length(x = genes.use),
    ncol = ncol(x = object@data)
  )
  rownames(scaled.data) <- genes.use
  # For small objects, process all genes in a single block
  if (length(object@cell.names) <= min.cells.to.block) {
    block.size <- length(genes.use)
  }
  gc()
  colnames(scaled.data) <- colnames(object@data)
  max.block <- ceiling(length(genes.use) / block.size)
  gc()
  print("Scaling data matrix")
  pb <- txtProgressBar(min = 0, max = max.block, style = 3)
  for (i in 1:max.block) {
    my.inds <- ((block.size * (i - 1)):(block.size * i - 1)) + 1
    my.inds <- my.inds[my.inds <= length(x = genes.use)]
    # inherits() is more robust than comparing class() with == and |
    if (inherits(x = data.use, what = c("dgCMatrix", "dgTMatrix"))) {
      data.scale <- FastSparseRowScale(
        mat = data.use[genes.use[my.inds], , drop = FALSE],
        scale = do.scale,
        center = do.center,
        scale_max = scale.max,
        display_progress = FALSE
      )
    } else {
      data.scale <- FastRowScale(
        mat = as.matrix(data.use[genes.use[my.inds], , drop = FALSE]),
        scale = do.scale,
        center = do.center,
        scale_max = scale.max,
        display_progress = FALSE
      )
    }
    dimnames(x = data.scale) <- dimnames(x = data.use[genes.use[my.inds], ])
    scaled.data[genes.use[my.inds], ] <- data.scale
    rm(data.scale)
    gc()
    setTxtProgressBar(pb, i)
  }
  close(pb)
  object <- SetAssayData(
    object = object,
    assay.type = assay.type,
    slot = 'scale.data',
    new.data = scaled.data
  )
  gc()
  return(object)
}

#' Normalize raw data
#'
#' Normalize count data per cell and transform to log scale
#'
#' @param data Matrix with the raw count data
#' @param scale.factor Scale the data.
#' Default is 1e4
#' @param display.progress Print progress
#'
#' @return Returns a matrix with the normalized and log transformed data
#'
#' @import Matrix
#'
#' @export
#'
LogNormalize <- function(data, scale.factor = 1e4, display.progress = TRUE) {
  # inherits() handles objects with multiple classes, where class() == would fail
  if (inherits(x = data, what = "data.frame")) {
    data <- as.matrix(x = data)
  }
  # The C++ kernel expects a column-compressed sparse matrix
  if (!inherits(x = data, what = "dgCMatrix")) {
    data <- as(object = data, Class = "dgCMatrix")
  }
  if (display.progress) {
    print("Performing log-normalization")
  }
  # Call the Rcpp routine to normalize per cell and log-transform
  norm.data <- LogNorm(data, scale_factor = scale.factor, display_progress = display.progress)
  colnames(x = norm.data) <- colnames(x = data)
  rownames(x = norm.data) <- rownames(x = data)
  return(norm.data)
}

#' Sample UMI
#'
#' Downsample each cell to a specified number of UMIs. Includes
#' an option to upsample cells below specified UMI as well.
#'
#' @param data Matrix with the raw count data
#' @param max.umi Number of UMIs to sample to
#' @param upsample Upsamples all cells with fewer than max.umi
#' @param progress.bar Display the progress bar
#'
#' @import Matrix
#'
#' @return Matrix with downsampled data
#'
#' @export
#'
SampleUMI <- function(
  data,
  max.umi = 1000,
  upsample = FALSE,
  progress.bar = TRUE
) {
  data <- as(data, "dgCMatrix")
  # A scalar max.umi applies the same target to every cell
  if (length(x = max.umi) == 1) {
    return(
      RunUMISampling(
        data = data,
        sample_val = max.umi,
        upsample = upsample,
        display_progress = progress.bar
      )
    )
  } else if (length(x = max.umi) != ncol(x = data)) {
    stop("max.umi vector not equal to number of cells")
  }
  # Otherwise use a per-cell sampling target
  return(
    RunUMISamplingPerCell(
      data = data,
      sample_val = max.umi,
      upsample = upsample,
      display_progress = progress.bar
    )
  )
}

#' Identify variable genes
#'
#' Identifies genes that are outliers on a 'mean variability plot'. First, uses
#' a function to calculate average expression (mean.function) and dispersion (dispersion.function)
#' for each gene. Next, divides genes into num.bin (default 20) bins based on
#' their average expression, and calculates z-scores for dispersion within each
#' bin. The purpose of this is to identify variable genes while controlling for
#' the strong relationship between variability and average expression.
#'
#' Exact parameter settings may vary empirically from dataset to dataset, and
#' based on visual inspection of the plot.
#' Setting the y.cutoff parameter to 2 identifies genes that are more than two standard
#' deviations away from the average dispersion within a bin. The default X-axis function
#' is the mean expression level, and for Y-axis it is the log(Variance/mean). All mean/variance
#' calculations are not performed in log-space, but the results are reported in log-space -
#' see relevant functions for exact details.
#'
#' @param object Seurat object
#' @param mean.function Function to compute x-axis value (average expression). Default
#' is to take the mean of the detected (i.e. non-zero) values
#' @param dispersion.function Function to compute y-axis value (dispersion). Default is to
#' take the standard deviation of all values.
#' @param do.plot Plot the average/dispersion relationship
#' @param set.var.genes Set object@@var.genes to the identified variable genes
#' (default is TRUE)
#' @param x.low.cutoff Bottom cutoff on x-axis for identifying variable genes
#' @param x.high.cutoff Top cutoff on x-axis for identifying variable genes
#' @param y.cutoff Bottom cutoff on y-axis for identifying variable genes
#' @param y.high.cutoff Top cutoff on y-axis for identifying variable genes
#' @param num.bin Total number of bins to use in the scaled analysis (default
#' is 20)
#' @param do.recalc TRUE by default. If FALSE, plots and selects variable genes without recalculating statistics for each gene.
#' @param sort.results If TRUE (by default), sort results in object@@hvg.info in decreasing order of dispersion
#' @param ... Extra parameters to VariableGenePlot
#' @inheritParams VariableGenePlot
#'
#' @importFrom MASS kde2d
#'
#' @return Returns a Seurat object, placing variable genes in object@@var.genes.
#' The result of all analysis is stored in object@@hvg.info
#'
#' @seealso \code{\link{VariableGenePlot}}
#'
#' @export
#'
FindVariableGenes <- function(
  object,
  mean.function = ExpMean,
  dispersion.function = LogVMR,
  do.plot = TRUE,
  set.var.genes = TRUE,
  x.low.cutoff = 0.1,
  x.high.cutoff = 8,
  y.cutoff = 1,
  y.high.cutoff = Inf,
  num.bin = 20,
  do.recalc = TRUE,
  sort.results = TRUE,
  ...
) {
  # Store the function *names* (via substitute) rather than closures
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("FindVariableGenes"))]
  parameters.to.store$mean.function <- as.character(substitute(mean.function))
  parameters.to.store$dispersion.function <- as.character(substitute(dispersion.function))
  object <- SetCalcParams(
    object = object,
    calculation = "FindVariableGenes",
    ... = parameters.to.store
  )
  data <- object@data
  if (do.recalc) {
    genes.use <- rownames(x = object@data)
    gene.mean <- rep(x = 0, length(x = genes.use))
    names(x = gene.mean) <- genes.use
    gene.dispersion <- gene.mean
    gene.dispersion.scaled <- gene.mean
    # Compute per-gene mean and dispersion in bins of 1000 genes to bound memory
    bin.size <- 1000
    max.bin <- floor(x = length(x = genes.use) / bin.size) + 1
    print("Calculating gene dispersion")
    pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
    for (i in 1:max.bin) {
      my.inds <- ((bin.size * (i - 1)):(bin.size * i - 1)) + 1
      my.inds <- my.inds[my.inds <= length(x = genes.use)]
      genes.iter <- genes.use[my.inds]
      data.iter <- data[genes.iter, , drop = FALSE]
      gene.mean[genes.iter] <- apply(X = data.iter, MARGIN = 1, FUN = mean.function)
      gene.dispersion[genes.iter] <- apply(X = data.iter, MARGIN = 1, FUN = dispersion.function)
      setTxtProgressBar(pb = pb, value = i)
    }
    close(con = pb)
    gene.dispersion[is.na(x = gene.dispersion)] <- 0
    gene.mean[is.na(x = gene.mean)] <- 0
    # Z-score the dispersions within bins of similar mean expression
    data_x_bin <- cut(x = gene.mean, breaks = num.bin)
    names(x = data_x_bin) <- names(x = gene.mean)
    mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = mean)
    sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = sd)
    gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)]) /
      sd_y[as.numeric(x = data_x_bin)]
    gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
    names(x = gene.dispersion.scaled) <- names(x = gene.mean)
    mv.df <- data.frame(gene.mean, gene.dispersion, gene.dispersion.scaled)
    rownames(x = mv.df) <- rownames(x = data)
    object@hvg.info <- mv.df
  }
  gene.mean <- object@hvg.info[, 1]
  gene.dispersion <- object@hvg.info[, 2]
  gene.dispersion.scaled <- object@hvg.info[, 3]
  names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(x = object@data)
  # Variable genes fall inside the mean window and above the dispersion cutoff
  pass.cutoff <- names(x = gene.mean)[which(
    x = (
      (gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff)
    ) &
      (gene.dispersion.scaled > y.cutoff) &
      (gene.dispersion.scaled < y.high.cutoff)
  )]
  if (do.plot) {
    VariableGenePlot(
      object = object,
      ...
    )
  }
  if (set.var.genes) {
    object@var.genes <- pass.cutoff
    if (sort.results) {
      object@hvg.info <- object@hvg.info[order(
        object@hvg.info$gene.dispersion,
        decreasing = TRUE
      ), ]
    }
    return(object)
  } else {
    return(pass.cutoff)
  }
}

#' Return a subset of the Seurat object
#'
#' Creates a Seurat object containing only a subset of the cells in the
#' original object. Takes either a list of cells to use as a subset, or a
#' parameter (for example, a gene), to subset on.
#'
#' @param object Seurat object
#' @param subset.names Parameters to subset on. Eg, the name of a gene, PC1, a
#' column name in object@@meta.data, etc.
#' Any argument that can be retrieved
#' using FetchData
#' @param low.thresholds Low cutoffs for the parameters (default is -Inf)
#' @param high.thresholds High cutoffs for the parameters (default is Inf)
#' @param cells.use A vector of cell names to use as a subset
#'
#' @return Returns a Seurat object containing only the relevant subset of cells
#'
#' @export
#'
FilterCells <- function(
  object,
  subset.names,
  low.thresholds,
  high.thresholds,
  cells.use = NULL
) {
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("FilterCells"))]
  object <- SetCalcParams(
    object = object,
    calculation = "FilterCells",
    ... = parameters.to.store
  )
  # Unspecified thresholds default to +/- Inf (no filtering on that side)
  if (missing(x = low.thresholds)) {
    low.thresholds <- replicate(n = length(x = subset.names), expr = -Inf)
  }
  if (missing(x = high.thresholds)) {
    high.thresholds <- replicate(n = length(x = subset.names), expr = Inf)
  }
  length.check <- sapply(
    X = list(subset.names, low.thresholds, high.thresholds),
    FUN = length
  )
  if (length(x = unique(x = length.check)) != 1) {
    stop("'subset.names', 'low.thresholds', and 'high.thresholds' must all have the same length")
  }
  data.subsets <- data.frame(subset.names, low.thresholds, high.thresholds)
  cells.use <- SetIfNull(x = cells.use, default = object@cell.names)
  # Apply each filter in turn, progressively narrowing the retained cell set
  for (i in seq(nrow(data.subsets))) {
    cells.use <- WhichCells(
      object = object,
      cells.use = cells.use,
      subset.name = data.subsets[i, 1],
      accept.low = data.subsets[i, 2],
      accept.high = data.subsets[i, 3]
    )
  }
  object <- SubsetData(object, cells.use = cells.use)
  return(object)
}
diff --git a/R/preprocessing_internal.R b/R/preprocessing_internal.R
new file mode 100644
index 000000000..1d7c4316d
--- /dev/null
+++ b/R/preprocessing_internal.R
@@ -0,0 +1,304 @@
#' Regress out technical effects and cell cycle
#'
#' Remove unwanted effects from scale.data
#'
#' @keywords internal
#' @param object Seurat object
#' @param latent.vars effects to regress out
#' @param genes.regress gene to run regression for (default is all genes)
#' @param model.use Use a linear model or generalized linear model (poisson, negative binomial) for the regression. Options are 'linear' (default), 'poisson', and 'negbinom'
#' @param use.umi Regress on UMI count data. Default is FALSE for linear modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'
#'
#' @return Returns the residuals from the regression model
#'
#' @import Matrix
#'
RegressOut <- function(
  object,
  latent.vars,
  genes.regress = NULL,
  model.use = 'linear',
  use.umi = FALSE
) {
  possible.models <- c("linear", "poisson", "negbinom")
  if (!model.use %in% possible.models) {
    stop(
      paste0(
        model.use,
        " is not a valid model. Please use one the following: ",
        paste0(possible.models, collapse = ", "),
        "."
      )
    )
  }
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = object@data))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = object@data))
  latent.data <- FetchData(object = object, vars.all = latent.vars)
  # Negative binomial fits are slow, so use a much smaller bin size
  bin.size <- 100
  if (model.use == 'negbinom') {
    bin.size <- 5
  }
  bin.ind <- ceiling(x = 1:length(x = genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  print(paste("Regressing out", latent.vars))
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
  data.resid <- c()
  data.use <- object@data[genes.regress, , drop = FALSE]
  # GLM models require (raw) UMI counts rather than normalized expression
  if (model.use != "linear") {
    use.umi <- TRUE
  }
  if (use.umi) {
    data.use <- object@raw.data[genes.regress, object@cell.names, drop = FALSE]
  }
  for (i in 1:max.bin) {
    genes.bin.regress <- rownames(x = data.use)[bin.ind == i]
    gene.expr <- as.matrix(x = data.use[genes.bin.regress, , drop = FALSE])
    # Fit one model per gene in this bin and keep the residuals
    new.data <- do.call(
      rbind,
      lapply(
        X = genes.bin.regress,
        FUN = function(x) {
          regression.mat <- cbind(latent.data, gene.expr[x, ])
          colnames(x = regression.mat) <- c(colnames(x = latent.data), "GENE")
          fmla <- as.formula(
            object = paste0(
              "GENE ",
              " ~ ",
              paste(latent.vars, collapse = "+")
            )
          )
          if (model.use == 'linear') {
            return(lm(formula = fmla, data = regression.mat)$residuals)
          }
          if (model.use == 'poisson') {
            return(residuals(
              object = glm(
                formula = fmla,
                data = regression.mat,
                family = "poisson"
              ),
              type = 'pearson'
            ))
          }
          if (model.use == 'negbinom') {
            return(NBResiduals(
              fmla = fmla,
              regression.mat = regression.mat,
              gene = x
            ))
          }
        }
      )
    )
    if (i == 1) {
      data.resid <- new.data
    } else {
      data.resid <- rbind(data.resid, new.data)
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)
  rownames(x = data.resid) <- genes.regress
  if (use.umi) {
    # Shift residuals to be non-negative per gene, then log-transform
    data.resid <- log1p(
      x = sweep(
        x = data.resid,
        MARGIN = 1,
        STATS = apply(X = data.resid, MARGIN = 1, FUN = min),
        FUN = "-"
      )
    )
  }
  return(data.resid)
}

# Regress out technical effects and cell cycle using regularized Negative Binomial regression
#
# Remove unwanted effects from umi data and set scale.data to Pearson residuals
# Uses mclapply; you can set the number of cores it will use to n with command options(mc.cores = n)
#
# @param object Seurat object
# @param latent.vars effects to regress out
# @param genes.regress gene to run regression for (default is all genes)
# @param pr.clip.range numeric of length two specifying the min and max values the results will be clipped to
#
# @return Returns Seurat object with the scale.data (object@scale.data) genes returning the residuals from the regression model
#
#' @import Matrix
#' @importFrom MASS theta.ml negative.binomial
#' @import parallel
#
RegressOutNB <- function(
  object,
  latent.vars,
  genes.regress = NULL,
  pr.clip.range = c(-30, 30),
  min.theta = 0.01
) {
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = object@data))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = object@data))
  cm <- object@raw.data[genes.regress, colnames(x = object@data), drop = FALSE]
  latent.data <- FetchData(object = object, vars.all = latent.vars)
  cat(sprintf('Regressing out %s for %d genes\n', paste(latent.vars), length(x = genes.regress)))
  # BUG FIX: pass min.theta through instead of the hard-coded 0.01 that made
  # the min.theta argument a no-op
  theta.fit <- RegularizedTheta(cm = cm, latent.data = latent.data, min.theta = min.theta, bin.size = 128)
  print('Second run NB regression with fixed theta')
  bin.size <- 128
  bin.ind <- ceiling(1:length(genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
  pr <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    bin.pr.lst <- parallel::mclapply(
      X = genes.bin.regress,
      FUN = function(j) {
        fit <- 0
        try(
          expr = fit <- glm(
            cm[j, ] ~ .,
            data = latent.data,
            family = MASS::negative.binomial(theta = theta.fit[j])
          ),
          silent = TRUE
        )
        # If glm failed, 'fit' is still the numeric placeholder; fall back
        if (class(fit)[1] == 'numeric') {
          message(
            sprintf(
              'glm and family=negative.binomial(theta=%f) failed for gene %s; falling back to scale(log10(y+1))',
              theta.fit[j],
              j
            )
          )
          res <- scale(log10(cm[j, ] + 1))[, 1]
        } else {
          res <- residuals(fit, type = 'pearson')
        }
        return(res)
      }
    )
    pr <- rbind(pr, do.call(rbind, bin.pr.lst))
    setTxtProgressBar(pb, i)
  }
  close(pb)
  dimnames(x = pr) <- dimnames(x = cm)
  # Clip extreme residuals to the requested range
  pr[pr < pr.clip.range[1]] <- pr.clip.range[1]
  pr[pr > pr.clip.range[2]] <- pr.clip.range[2]
  object@scale.data <- pr
  return(object)
}

# Regress out technical effects and cell cycle using regularized Negative Binomial regression
#
# Remove unwanted effects from umi data and set scale.data to Pearson residuals
# Uses mclapply; you can set the number of cores it will use to n with command options(mc.cores = n)
#
# @param object Seurat object
# @param latent.vars effects to regress out
# @param genes.regress gene to run regression for (default is all genes)
# @param pr.clip.range numeric of length two specifying the min and max values the results will be clipped to
#
# @return Returns Seurat object with the scale.data
# (object@scale.data) genes returning the residuals from the regression model
#
#' @import Matrix
#' @importFrom MASS theta.ml negative.binomial
#' @import parallel
#
RegressOutNBreg <- function(
  object,
  latent.vars,
  genes.regress = NULL,
  pr.clip.range = c(-30, 30),
  min.theta = 0.01
) {
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = object@data))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = object@data))
  cm <- object@raw.data[genes.regress, colnames(x = object@data), drop = FALSE]
  # BUG FIX: the argument was misspelled 'boject', which made this call error out
  latent.data <- FetchData(object = object, vars.all = latent.vars)
  bin.size <- 128
  bin.ind <- ceiling(x = 1:length(x = genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  print(paste("Regressing out", latent.vars))
  print('First run Poisson regression (to get initial mean), and estimate theta per gene')
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
  theta.estimate <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    bin.theta.estimate <- unlist(
      parallel::mclapply(
        X = genes.bin.regress,
        FUN = function(j) {
          as.numeric(
            x = MASS::theta.ml(
              cm[j, ],
              glm(cm[j, ] ~ ., data = latent.data, family = poisson)$fitted
            )
          )
        }
      ),
      use.names = FALSE
    )
    theta.estimate <- c(theta.estimate, bin.theta.estimate)
    setTxtProgressBar(pb, i)
  }
  close(pb)
  # Regularize theta: smooth the mean-variance relationship across genes
  UMI.mean <- apply(X = cm, MARGIN = 1, FUN = mean)
  var.estimate <- UMI.mean + (UMI.mean ^ 2) / theta.estimate
  fit <- loess(log10(var.estimate) ~ log10(UMI.mean), span = 0.33)
  theta.fit <- (UMI.mean ^ 2) / (10 ^ fit$fitted - UMI.mean)
  names(x = theta.fit) <- genes.regress
  to.fix <- theta.fit <= min.theta | is.infinite(x = theta.fit)
  if (any(to.fix)) {
    cat(
      'Fitted theta below',
      min.theta,
      'for',
      sum(to.fix),
      'genes, setting them to',
      min.theta,
      '\n'
    )
    theta.fit[to.fix] <- min.theta
  }
  print('Second run NB regression with fixed theta')
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
  pr <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    bin.pr.lst <- parallel::mclapply(
      X = genes.bin.regress,
      FUN = function(j) {
        fit <- 0
        try(
          fit <- glm(
            cm[j, ] ~ .,
            data = latent.data,
            family = MASS::negative.binomial(theta = theta.fit[j])
          ),
          silent = TRUE
        )
        # If glm failed, 'fit' is still the numeric placeholder; fall back
        if (class(fit)[1] == 'numeric') {
          message(
            sprintf(
              'glm and family=negative.binomial(theta=%f) failed for gene %s; falling back to scale(log10(y+1))',
              theta.fit[j],
              j
            )
          )
          res <- scale(x = log10(cm[j, ] + 1))[, 1]
        } else {
          res <- residuals(object = fit, type = 'pearson')
        }
        return(res)
      }
    )
    pr <- rbind(pr, do.call(rbind, bin.pr.lst))
    setTxtProgressBar(pb, i)
  }
  close(pb)
  dimnames(pr) <- dimnames(cm)
  # Clip extreme residuals to the requested range
  pr[pr < pr.clip.range[1]] <- pr.clip.range[1]
  pr[pr > pr.clip.range[2]] <- pr.clip.range[2]
  # BUG FIX: previously assigned the undefined variable 'r' here
  object@scale.data <- pr
  return(object)
}
diff --git a/R/printing_utilities.R b/R/printing_utilities.R
new file mode 100644
index 000000000..22088ab0c
--- /dev/null
+++ b/R/printing_utilities.R
@@ -0,0 +1,710 @@
#' Print the calculation
#'
#' Print entire contents of calculation settings slot (calc.params) for given
#' calculation.
#'
#' @param object Seurat object
#' @param calculation Name of calculation (function name) to check parameters
#' for
#' @param raw Print the entire contents of the calculation settings slot (calc.params)
#' for the given calculation.
+#' @param return.list Return the calculation parameters as a list +#' @return Prints the calculation settings and optionally returns them as a list +#' @export +PrintCalcParams <- function(object, calculation, raw = FALSE, + return.list = FALSE) { + if(is.null(object@calc.params[[calculation]])){ + stop(paste0(calculation, " not computed yet.")) + } + if (!raw){ + if(calculation == "RunPCA"){ + PrintPCAParams(object) + } + if(calculation == "ICA"){ + PrintICAParams(object) + } + if(calculation == "RunTSNE"){ + PrintTSNEParams(object) + } + if(calculation == "RunCCA"){ + PrintCCAParams(object) + } + if(calculation == "CalcVarExpRatioParams"){ + PrintCalcVarExpRatioParams(object) + } + if(calculation == "AlignSubspace"){ + PrintAlignSubspaceParams(object) + } + if(calculation == "RunDiffusion"){ + PrintDMParams(object) + } + if(calculation == "BuildSNN"){ + PrintSNNParams(object) + } + if(calculation == "FindClusters"){ + PrintFindClustersParams(object) + } + } + print(object@calc.params[[calculation]]) + if(return.list){ + return(object@calc.params[[calculation]]) + } +} + +#' Print PCA Calculation Parameters +#' +#' Print the parameters chosen for the latest stored PCA calculation. +#' +#' @param object Seurat object +#' @param raw Print the entire contents of the calculation settings slot +#' (calc.params) for the RunPCA calculation. Default (FALSE) will print a nicely +#' formatted summary. +#' @return No return value. Only prints to console. 
+#' @export +PrintPCAParams <- function(object, raw = FALSE){ + if(is.null(object@calc.params$RunPCA)){ + stop("PCA has not been computed yet") + } + if (raw){ + print(object@calc.params$RunPCA) + } + else{ + cat(paste0("Parameters used in latest PCA calculation run on: ", + GetCalcParam(object = object, + calculation = "RunPCA", + parameter = "time"), + "\n")) + cat("=============================================================================\n") + cat(paste0("PCs computed Genes used in calculation PCs Scaled by Variance Explained\n")) + pcs.compute <- GetCalcParam(object = object, + calculation = "RunPCA", + parameter = "pcs.compute") + n.gene <- length(GetCalcParam(object = object, + calculation = "RunPCA", + parameter = "pc.genes")) + cat(paste0(" ", + pcs.compute, + FillWhiteSpace(n = 20 - nchar(pcs.compute)), + n.gene, FillWhiteSpace(n = 35 - nchar(n.gene)), + GetCalcParam(object = object, + calculation = "RunPCA", + parameter = "weight.by.var"), + "\n")) + cat("-----------------------------------------------------------------------------\n") + cat("rev.pca \n") + cat(paste0(" ", + GetCalcParam(object = object, + calculation = "RunPCA", + parameter = "rev.pca"), + "\n")) + cat("-----------------------------------------------------------------------------\n") + cat("Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"RunPCA\", parameter = \"pc.genes\")") + } +} + +#' Print ICA Calculation Parameters +#' +#' Print the parameters chosen for the latest stored ICA calculation. +#' +#' @param object Seurat object +#' @param raw Print the entire contents of the calculation settings slot (calc.params) for the ICA +#' calculation. Default (FALSE) will print a nicely formatted summary. +#' @return No return value. Only prints to console. 
+#' @export +PrintICAParams <- function(object, raw = FALSE){ + if(is.null(object@calc.params$ICA)){ + stop("ICA has not been computed yet") + } + if (raw){ + print(object@calc.params$ICA) + } + else{ + cat(paste0("Parameters used in latest ICA calculation run on: ", + GetCalcParam(object = object, + calculation = "ICA", + parameter = "time"), + "\n")) + cat("=============================================================================\n") + cat(paste0("ICs computed \t Genes used in calculation \t ICA function \t rev.ica \n")) + ics.compute <- GetCalcParam(object = object, + calculation = "ICA", + parameter = "ics.compute") + n.genes <- length(GetCalcParam(object = object, + calculation = "ICA", + parameter = "ic.genes")) + ica.fxn <- GetCalcParam(object = object, + calculation = "ICA", + parameter = "ica.function") + rev.ica <- GetCalcParam(object = object, + calculation = "ICA", + parameter = "rev.ica") + cat(paste0(" ", + ics.compute, + FillWhiteSpace(n = 25 - nchar(ics.compute)), + n.genes, + FillWhiteSpace(n = 22 - nchar(n.genes)), + ica.fxn, + FillWhiteSpace(n = 15 - nchar(ica.fxn)), + rev.ica,"\n")) + cat("-----------------------------------------------------------------------------\n") + cat("Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"ICA\", parameter = \"ic.genes\")") + } +} + +#' Print TSNE Calculation Parameters +#' +#' Print the parameters chosen for the latest stored TSNE calculation. +#' +#' @param object Seurat object +#' @param raw Print the entire contents of the calculation settings slot (calc.params) for the +#' RunTSNE calculation. Default (FALSE) will print a nicely formatted summary. +#' @return No return value. Only prints to console. 
+#' @export +PrintTSNEParams <- function(object, raw = FALSE){ + if(is.null(object@calc.params$RunTSNE)){ + stop("TSNE has not been computed yet") + } + if (raw){ + print(object@calc.params$RunTSNE) + } + else{ + cat(paste0("Parameters used in latest TSNE calculation run on: ", + GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "time"), + "\n")) + cat("=============================================================================\n") + + if(is.null(GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "genes.use"))) { + reduction <- GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "reduction.use") + dim <- "Dims" + n.dim <- GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "dims.use") + } else if (!is.null(GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "distance.matrix"))){ + reduction <- "custom" + dim <- "Custom distance matrix" + } else { + reduction <- "None" + dim <- "Genes" + n.dim <- length(GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "genes.use")) + } + do.fast <- GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "do.fast") + dim.embed <- GetCalcParam(object = object, + calculation = "RunTSNE", + parameter = "dim.embed") + cat(paste0("Reduction use do.fast dim.embed\n")) + cat(paste0(" ", + reduction, + FillWhiteSpace(n = 19 - nchar(reduction)), + do.fast, + FillWhiteSpace(n = 20 - nchar(do.fast)), + dim.embed, + "\n")) + cat("-----------------------------------------------------------------------------\n") + cat(paste0(dim, " used in calculation\n")) + cat("=============================================================================\n") + if(reduction == "None"){ + cat(paste0(n.dim, " genes used: Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"RunTSNE\", parameter = \"genes.use\")")) + } else if (reduction == "custom") { + cat("Full matrix can be acccessed using \n 
GetCalcParam(object = object, calculation = \"RunTSNE\", parameter = \"distance.matrix\")")
    } else {
      cat(paste0(strwrap(paste(n.dim, "\n", collapse = " "), width = 80),
                 collapse = "\n"))
      cat("\n\n")
    }
  }
}

#' Print CCA Calculation Parameters
#'
#' Print the parameters chosen for the latest stored CCA calculation.
#'
#' @param object Seurat object
#' @param raw Print the entire contents of the calculation settings slot
#' (calc.params) for the RunCCA calculation. Default (FALSE) will print a nicely
#' formatted summary.
#' @return No return value. Only prints to console.
#' @export
PrintCCAParams <- function(object, raw = FALSE){
  if(is.null(object@calc.params$RunCCA)){
    stop("CCA has not been computed yet")
  }
  if (raw){
    print(object@calc.params$RunCCA)
  }
  else{
    cat(paste0("Parameters used in latest CCA calculation run on: ",
               GetCalcParam(object = object,
                            calculation = "RunCCA",
                            parameter = "time"), "\n"))
    cat("=============================================================================\n")

    cat(paste0("CCs computed Genes used in calculation scale.data\n"))
    num.cc <- GetCalcParam(object = object,
                           calculation = "RunCCA",
                           parameter = "num.cc")
    num.genes <- length(GetCalcParam(object = object,
                                     calculation = "RunCCA",
                                     parameter = "genes.use"))
    cat(paste0(" ",
               num.cc ,
               FillWhiteSpace(28 - nchar(num.cc)),
               num.genes,
               FillWhiteSpace(n = 24 - nchar(num.genes)),
               GetCalcParam(object = object,
                            calculation = "RunCCA",
                            parameter = "scale.data"),
               "\n"))
    cat("-----------------------------------------------------------------------------\n")
    g1 <- GetCalcParam(object = object,
                       calculation = "RunCCA",
                       parameter = "group1")
    g2 <- GetCalcParam(object = object,
                       calculation = "RunCCA",
                       parameter = "group2")
    if(nchar(g1) > 0){
      cat(paste0("group1",
                 FillWhiteSpace(n = 10),
                 "group2",
                 FillWhiteSpace(n = 10),
                 "group.by",
                 FillWhiteSpace(n = 10),
                 "rescale.groups\n"))
      gb <- GetCalcParam(object = object,
                         calculation = "RunCCA",
                         parameter = "group.by")
      # Multi-element group specs are summarized as "custom group" in the table.
      if(length(g1) > 1) {
        g1 <- "custom group"
      }
      if(length(g2) > 1){
        g2 <- "custom group"
      }
      rsg <- GetCalcParam(object = object,
                          calculation = "RunCCA",
                          parameter = "rescale.groups")
      cat(paste0(g1,
                 FillWhiteSpace(n = 15 - nchar(g1)),
                 g2,
                 FillWhiteSpace(n = 18 - nchar(g2)),
                 gb,
                 # NOTE(review): pad width is derived from nchar(rsg), not the
                 # preceding 'gb' column — looks like a copy-paste slip; confirm
                 # intended alignment before changing output.
                 FillWhiteSpace(n = 18 - nchar(rsg)),
                 rsg ,
                 "\n"))
      cat("-----------------------------------------------------------------------------\n")
    }
    if(!is.null(GetCalcParam(object = object,
                             calculation = "RunCCA",
                             parameter = "object.project"))){
      n1 <- GetCalcParam(object = object,
                         calculation = "RunCCA",
                         parameter = "object.project")
      n2 <- GetCalcParam(object = object,
                         calculation = "RunCCA",
                         parameter = "object2.project")
      cat("Object 1 Project Name Object 2 Project Name\n")
      cat(paste0(" ",
                 n1,
                 FillWhiteSpace(n = 30 - nchar(n1)),
                 n2,
                 "\n"))
      cat("-----------------------------------------------------------------------------\n")
    }
    # BUG FIX: both operands of '|' tested g2, so a custom group1 alone never
    # triggered this hint; test g1 OR g2.
    if(g1 == "custom group" | g2 == "custom group"){
      cat("Group membership lists can be accessed using \n GetCalcParam(object = object, calculation = \"RunCCA\", parameter = \"group1/2\")\n")
    }
    cat("Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"RunCCA\", parameter = \"genes.use\")")
  }
}

#' Print Parameters Associated with CalcVarExpRatio
#'
#' Print the parameters chosen for CalcVarExpRatio.
#'
#' @param object Seurat object
#' @param raw Print the entire contents of the calculation settings slot
#' (calc.params) for CalcVarExpRatio. Default (FALSE) will print a nicely
#' formatted summary.
#' @return No return value. Only prints to console.
#' @export
PrintCalcVarExpRatioParams <- function(object, raw = FALSE){
  if(is.null(object@calc.params$CalcVarExpRatio)){
    stop("CalcVarExpRatio has not been computed yet")
  }
  if (raw){
    print(object@calc.params$CalcVarExpRatio)
  }
  else{
    cat(paste0("Parameters used in latest CalcVarExpRatio run on: ",
               # BUG FIX: the timestamp was fetched with calculation = "PCA",
               # printing the PCA run time instead of the CalcVarExpRatio run
               # time reported by the surrounding message.
               GetCalcParam(object = object,
                            calculation = "CalcVarExpRatio",
                            parameter = "time"),
               "\n"))
    cat("=============================================================================\n")
    cat(paste0("Reduction Type Grouping Variable \n"))
    reduction <- GetCalcParam(object = object,
                              calculation = "CalcVarExpRatio",
                              parameter = "reduction.type")
    grouping.var <- GetCalcParam(object = object,
                                 calculation = "CalcVarExpRatio",
                                 parameter = "grouping.var")
    dims.use <- GetCalcParam(object = object,
                             calculation = "CalcVarExpRatio",
                             parameter = "dims.use")
    cat(paste0(" ",
               reduction,
               FillWhiteSpace(n = 20 - nchar(reduction)),
               grouping.var,
               "\n"))
    cat("-----------------------------------------------------------------------------\n")
    cat("Dims used in calculation\n")
    cat(paste0(strwrap(paste(dims.use, "\n", collapse = " "), width = 80),
               collapse = "\n"))
  }
}

#' Print AlignSubspace Calculation Parameters
#'
#' Print the parameters chosen for the latest AlignSubspace calculation for each
#' stored aligned subspace.
#'
#' @param object Seurat object
#' @param raw Print the entire contents of the calculation settings slot
#' (calc.params) for the AlignSubspace calculation. Default (FALSE) will print a
#' nicely formatted summary.
#' @return No return value. Only prints to console.
#' @export
PrintAlignSubspaceParams <- function(object, raw = FALSE){
  # One calc.params entry is stored per aligned reduction ("AlignSubspace.<type>");
  # print each of them.
  to.print <- names(object@calc.params)[grepl("AlignSubspace.",
                                              names(object@calc.params))]
  if(length(to.print) == 0){
    stop("No stored aligned subspaces.")
  }
  for (i in to.print){
    if (raw){
      print(object@calc.params[[i]])
    }
    else{
      cat(paste0("Parameters used in latest AlignSubspace calculation run on: ",
                 GetCalcParam(object = object,
                              calculation = i,
                              parameter = "time"),
                 "\n"))
      cat("=============================================================================\n")
      reduction <- GetCalcParam(object = object,
                                calculation = i,
                                parameter = "reduction.type")
      grouping.var <- GetCalcParam(object = object,
                                   calculation = i,
                                   parameter = "grouping.var")
      dims <- GetCalcParam(object = object,
                           calculation = i,
                           parameter = "dims.align")
      n.genes <- GetCalcParam(object = object,
                              calculation = i,
                              parameter = "num.genes")
      cat(paste0("Reduction use grouping.var num.genes\n"))
      cat(paste0(" ",
                 reduction,
                 FillWhiteSpace(n = 19 - nchar(reduction)),
                 grouping.var,
                 # BUG FIX: padding was computed from nchar(dims) (the
                 # dims.align vector — wrong field, and a length > 1 vector
                 # misbehaves here); pad relative to the grouping.var column
                 # that was just printed.
                 FillWhiteSpace(n = 15 - nchar(grouping.var)),
                 n.genes,
                 "\n"))
      cat("-----------------------------------------------------------------------------\n")
      cat("Dims aligned\n")
      cat("=============================================================================\n")
      cat(paste0(strwrap(paste(dims, "\n", collapse = " "), width = 80),
                 collapse = "\n"))
      cat("\n")
    }
  }
}

#' Print Diffusion Map Calculation Parameters
#'
#' Print the parameters chosen for the latest stored diffusion map calculation.
#'
#' @param object Seurat object
#' @param raw Print the entire contents of the calculation settings slot
#' (calc.params) for the RunDiffusion calculation. Default (FALSE) will print a
#' nicely formatted summary.
#' @return No return value. Only prints to console.
#' @export
PrintDMParams <- function(object, raw = FALSE){
  if(is.null(object@calc.params$RunDiffusion)){
    stop("Diffusion map has not been computed yet")
  }
  if (raw){
    print(object@calc.params$RunDiffusion)
  }
  else{
    cat(paste0("Parameters used in latest diffusion map calculation run on: ",
               GetCalcParam(object = object,
                            calculation = "RunDiffusion",
                            parameter = "time"), "\n"))
    cat("=============================================================================\n")
    max.dim <- GetCalcParam(object = object,
                            calculation = "RunDiffusion",
                            parameter = "max.dim")
    reduction <- GetCalcParam(object = object,
                              calculation = "RunDiffusion",
                              parameter = "reduction.use")
    n.genes <- length(GetCalcParam(object = object,
                                   calculation = "RunDiffusion",
                                   parameter = "genes.use"))
    # BUG FIX: scale.clip was wrapped in length(), so the table always showed
    # '1' (or '0') instead of the actual clip value. length() is only correct
    # for n.genes above, where the stored parameter is a gene vector.
    scale.clip <- GetCalcParam(object = object,
                               calculation = "RunDiffusion",
                               parameter = "scale.clip")
    q.use <- GetCalcParam(object = object,
                          calculation = "RunDiffusion",
                          parameter = "q.use")
    dims.use <- GetCalcParam(object = object,
                             calculation = "RunDiffusion",
                             parameter = "dims.use")
    # A stored gene list means the map was run on genes, not a reduction.
    if(n.genes > 0){
      reduction <- "None"
    }
    cat(paste0("Reduction used DMs computed Quantile scale.clip \n"))
    cat(paste0(" ",
               reduction ,
               FillWhiteSpace(20 - nchar(reduction)),
               max.dim,
               FillWhiteSpace(n = 12 - nchar(max.dim)),
               q.use,
               FillWhiteSpace(n = 15 - nchar(q.use)),
               scale.clip,
               "\n"))
    cat("-----------------------------------------------------------------------------\n")
    if(reduction == "None"){
      dim <- "Genes"
    }
    else{
      dim <- "Dims"
    }
    cat(paste0(dim, " used in calculation\n"))
    cat("=============================================================================\n")
    if(reduction == "None"){
      cat(paste0(n.genes, " genes used: Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"RunDiffusion\", parameter = \"genes.use\")"))
    } else {
      cat(paste0(strwrap(paste(dims.use, "\n", collapse = " "), width = 80),
                 collapse = "\n"))
      cat("\n\n")
    }
  }
}


#' Print SNN Construction Calculation Parameters
#'
#' Print the parameters chosen for the latest stored SNN calculation (via BuildSNN or FindClusters).
#'
#' @param object Seurat object
#' @param raw Print the entire contents of the calculation settings slot (calc.params) for the
#' BuildSNN calculation. Default (FALSE) will print a nicely formatted summary.
#' @return No return value. Only prints to console.
#' @export
PrintSNNParams <- function(object, raw = FALSE){
  if(is.null(object@calc.params$BuildSNN)){
    stop("SNN has not been computed yet")
  }
  if (raw){
    print(object@calc.params$BuildSNN)
  }
  else{
    cat(paste0("Parameters used in latest SNN calculation run on: ",
               GetCalcParam(object = object,
                            calculation = "BuildSNN",
                            parameter = "time"),
               "\n"))
    cat("=============================================================================\n")
    if(is.null(GetCalcParam(object = object,
                            calculation = "BuildSNN",
                            parameter = "genes.use")))
    {
      reduction <- GetCalcParam(object = object,
                                calculation = "BuildSNN",
                                parameter = "reduction.type")
      dim <- "Dims"
      n.dim <- GetCalcParam(object = object,
                            calculation = "BuildSNN",
                            parameter = "dims.use")
    } else if (!is.null(GetCalcParam(object = object,
                                     calculation = "BuildSNN",
                                     parameter = "distance.matrix")))
    {
      reduction <- "custom"
      dim <- "Custom distance matrix"
    } else {
      reduction <- "None"
      dim <- "Genes"
      n.dim <- length(GetCalcParam(object = object,
                                   calculation = "BuildSNN",
                                   parameter = "genes.use"))
    }
    cat(paste0("Reduction used k.param k.scale prune.SNN\n"))
    k.param <- GetCalcParam(object = object,
                            calculation = "BuildSNN",
                            parameter = "k.param")
    k.scale <- GetCalcParam(object = object,
                            calculation = "BuildSNN",
                            parameter = "k.scale")
    prune.SNN <- GetCalcParam(object = object,
                              calculation = "BuildSNN",
                              parameter = "prune.SNN")
    cat(paste0(" ",
               reduction,
               FillWhiteSpace(n
= 20 - nchar(reduction)), + k.param, + FillWhiteSpace(n = 18 - nchar(k.param)), + k.scale, + FillWhiteSpace(n = 16 - nchar(k.scale)), + round(prune.SNN, 4), + "\n")) + cat("-----------------------------------------------------------------------------\n") + cat(paste0(dim, " used in calculation\n")) + cat("=============================================================================\n") + if(reduction == "None"){ + cat(paste0(n.dim, " genes used: Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"BuildSNN\", parameter = \"genes.use\")")) + } else if (reduction == "custom") { + cat("Full matrix can be acccessed using \n GetCalcParam(object = object, calculation = \"RunTSNE\", parameter = \"distance.matrix\")") + } else { + cat(paste0(strwrap(paste(n.dim, "\n", collapse = " "), width = 80), + collapse = "\n")) + cat("\n\n") + } + } +} + +#' Print FindClusters Calculation Parameters +#' +#' Print the parameters chosen for the latest FindClusters calculation for each +#' stored resolution. +#' +#' @param object Seurat object +#' @param resolution Optionally specify only a subset of resolutions to print +#' parameters for. +#' @param raw Print the entire contents of the calculation settings slot +#' (calc.params) for the FindClusters calculation. Default (FALSE) will print a +#' nicely formatted summary. +#' @return No return value. Only prints to console. 
+#' @export +PrintFindClustersParams <- function(object, resolution, raw = FALSE){ + to.print <- names(object@calc.params)[grepl("FindClusters", + names(object@calc.params))] + if(length(to.print) == 0){ + stop("No stored clusterings.") + } + for (i in to.print){ + if(!missing(resolution)){ + if(!ExtractField(i, 2, "res.") %in% resolution){ + next + } + } + if (raw){ + print(object@calc.params[[i]]) + } + else{ + cat(paste0("Parameters used in latest FindClusters calculation run on: ", + GetCalcParam(object = object, + calculation = i, + parameter = "time"), + "\n")) + resolution <- GetCalcParam(object = object, + calculation = i, + parameter = "resolution") + cat("=============================================================================\n") + cat(paste0("Resolution: ", resolution, "\n")) + cat("-----------------------------------------------------------------------------\n") + cat("Modularity Function Algorithm n.start n.iter\n") + modularity.fxn <- GetCalcParam(object = object, + calculation = i, + parameter = "modularity.fxn") + algorithm <- GetCalcParam(object = object, + calculation = i, + parameter = "algorithm") + n.start <- GetCalcParam(object = object, + calculation = i, + parameter = "n.start") + n.iter <- GetCalcParam(object = object, + calculation = i, + parameter = "n.iter") + cat(paste0(" ", + modularity.fxn, + FillWhiteSpace(n = 20 - nchar(modularity.fxn)), + algorithm, + FillWhiteSpace(n = 18 - nchar(algorithm)), + n.start, + FillWhiteSpace(n = 16 - nchar(n.start)), + n.iter, + "\n")) + cat("-----------------------------------------------------------------------------\n") + if (is.null(GetCalcParam(object = object, + calculation = i, + parameter = "genes.use"))) + { + reduction <- GetCalcParam(object = object, + calculation = i, + parameter = "reduction.type") + dim <- "Dims" + n.dim <- GetCalcParam(object = object, + calculation = i, + parameter = "dims.use") + } else if (!is.null(GetCalcParam(object = object, + calculation = "BuildSNN", + 
parameter = "distance.matrix"))) + { + reduction <- "custom" + dim <- "Custom distance matrix" + } else { + reduction <- "None" + dim <- "Genes" + n.dim <- length(GetCalcParam(object = object, + calculation = "BuildSNN", + parameter = "genes.use")) + } + cat(paste0("Reduction used k.param k.scale prune.SNN\n")) + k.param <- GetCalcParam(object = object, + calculation = "BuildSNN", + parameter = "k.param") + k.scale <- GetCalcParam(object = object, + calculation = "BuildSNN", + parameter = "k.scale") + prune.SNN <- GetCalcParam(object = object, + calculation = "BuildSNN", + parameter = "prune.SNN") + cat(paste0(" ", + reduction, + FillWhiteSpace(n = 20 - nchar(reduction)), + k.param, + FillWhiteSpace(n = 18 - nchar(k.param)), + k.scale, + FillWhiteSpace(n = 16 - nchar(k.scale)), + round(prune.SNN, 4), + "\n")) + cat("-----------------------------------------------------------------------------\n") + cat(paste0(dim, " used in calculation\n")) + cat("=============================================================================\n") + if(reduction == "None"){ + cat(paste0(n.dim, " genes used: Full gene list can be accessed using \n GetCalcParam(object = object, calculation = \"BuildSNN\", parameter = \"genes.use\")")) + } else if (reduction == "custom") { + cat("Full matrix can be acccessed using \n GetCalcParam(object = object, calculation = \"RunTSNE\", parameter = \"distance.matrix\")") + } else { + cat(paste0(strwrap(paste(n.dim, "\n", collapse = " "), width = 80), + collapse = "\n")) + cat("\n\n") + } + } + } +} + diff --git a/R/seurat.R b/R/seurat.R new file mode 100644 index 000000000..d9e5ccef9 --- /dev/null +++ b/R/seurat.R @@ -0,0 +1,94 @@ +################################################################################ +### Seurat + +#' The Seurat Class +#' +#' The Seurat object is the center of each single cell analysis. It stores all information +#' associated with the dataset, including data, annotations, analyes, etc. 
All that is needed +#' to construct a Seurat object is an expression matrix (rows are genes, columns are cells), which +#' should be log-scale +#' +#' Each Seurat object has a number of slots which store information. Key slots to access +#' are listed below. +#' +#' +#'@section Slots: +#' \describe{ +#' \item{\code{raw.data}:}{\code{"ANY"}, The raw project data } +#' \item{\code{data}:}{\code{"ANY"}, The expression matrix (log-scale) } +#' \item{\code{scale.data}:}{\code{"ANY"}, The scaled (after z-scoring +#' each gene) expression matrix. Used for PCA, ICA, and heatmap plotting} +#' \item{\code{var.genes}:}{\code{"vector"}, Variable genes across single cells } +#' \item{\code{is.expr}:}{\code{"numeric"}, Expression threshold to determine if a gene is expressed } +#' \item{\code{ident}:}{\code{"factor"}, The 'identity class' for each single cell } +#' \item{\code{meta.data}:}{\code{"data.frame"}, Contains information about metadata each cell, starting with # of genes detected (nGene) +#' the original identity class (orig.ident), user-provided information (through AddMetaData), etc. } +#' \item{\code{project.name}:}{\code{"character"}, Name of the project (for record keeping) } +#' \item{\code{dr}:}{\code{"list"}, List of stored dimensional reductions. Named by technique } +#' \item{\code{assay}:}{\code{"list"}, List of additional assays for multimodal analysis. 
Named by technique } +#' \item{\code{hvg.info}:}{\code{"data.frame"}, The output of the mean/variability analysis for all genes } +#' \item{\code{imputed}:}{\code{"data.frame"}, Matrix of imputed gene scores } +#' \item{\code{cell.names}:}{\code{"vector"}, Names of all single cells (column names of the expression matrix) } +#' \item{\code{cluster.tree}:}{\code{"list"}, List where the first element is a phylo object containing the +#' phylogenetic tree relating different identity classes } +#' \item{\code{snn}:}{\code{"dgCMatrix"}, Sparse matrix object representation of the SNN graph } +#' \item{\code{calc.params}:}{\code{"list"}, Named list to store all calculation related parameters choices} +#' \item{\code{kmeans}:}{\code{"ANY"}, Stores output of gene-based clustering from DoKMeans} +#' \item{\code{spatial}:}{\code{"ANY"},Stores internal data and calculations for spatial mapping of single cells} +#' \item{\code{misc}:}{\code{"ANY"}, Miscellaneous spot to store any data alongisde the object (for example, gene lists)} +#' \item{\code{version}:}{\code{"ANY"}, Version of package used in object creation} +#'} +#' @name seurat +#' @rdname seurat +#' @aliases seurat-class +#' @exportClass seurat +#' @importFrom Rcpp evalCpp +#' @useDynLib Seurat + +seurat <- setClass( + "seurat", + slots = c( + raw.data = "ANY", + data = "ANY", + scale.data = "ANY", + var.genes = "vector", + is.expr = "numeric", + ident = "factor", + meta.data = "data.frame", + project.name = "character", + dr = "list", + assay = "list", + hvg.info = "data.frame", + imputed = "data.frame", + cell.names = "vector", + cluster.tree = "list", + snn = "dgCMatrix", + calc.params = "list", + kmeans = "ANY", + spatial = "ANY", + misc = "ANY", + version = "ANY" + ) +) + +# Documentation +############### +#' @export +setMethod( + f = "show", + signature = "seurat", + definition = function(object) { + cat( + "An object of class", + class(object), + "in project", + object@project.name, + "\n", + nrow(x = 
object@data), + "genes across", + ncol(x = object@data), + "samples.\n" + ) + invisible(x = NULL) + } +) diff --git a/R/seuratFxns.R b/R/seuratFxns.R new file mode 100644 index 000000000..fd40910d9 --- /dev/null +++ b/R/seuratFxns.R @@ -0,0 +1,4 @@ + + + + diff --git a/R/snn.R b/R/snn.R new file mode 100644 index 000000000..88f667908 --- /dev/null +++ b/R/snn.R @@ -0,0 +1,237 @@ +#' @include seurat.R +NULL +#' SNN Graph Construction +#' +#' Constructs a Shared Nearest Neighbor (SNN) Graph for a given dataset. We +#' first determine the k-nearest neighbors of each cell (defined by k.param * +#' k.scale). We use this knn graph to construct the SNN graph by calculating the +#' neighborhood overlap (Jaccard distance) between every cell and its k.param * +#' k.scale nearest neighbors (defining the neighborhood for each cell as the +#' k.param nearest neighbors). +#' +#' @param object Seurat object +#' @param genes.use A vector of gene names to use in construction of SNN graph +#' if building directly based on expression data rather than a dimensionally +#' reduced representation (i.e. PCs). +#' @param reduction.type Name of dimensional reduction technique to use in +#' construction of SNN graph. (e.g. "pca", "ica") +#' @param dims.use A vector of the dimensions to use in construction of the SNN +#' graph (e.g. To use the first 10 PCs, pass 1:10) +#' @param k.param Defines k for the k-nearest neighbor algorithm +#' @param k.scale Granularity option for k.param +#' @param plot.SNN Plot the SNN graph +#' @param prune.SNN Sets the cutoff for acceptable Jaccard distances when +#' computing the neighborhood overlap for the SNN construction. Any edges with +#' values less than or equal to this will be set to 0 and removed from the SNN +#' graph. Essentially sets the strigency of pruning (0 --- no pruning, 1 --- +#' prune everything). 
+#' @param print.output Whether or not to print output to the console +#' @param distance.matrix Build SNN from distance matrix (experimental) +#' @param force.recalc Force recalculation of SNN. +#' @importFrom FNN get.knn +#' @importFrom igraph plot.igraph graph.adjlist graph.adjacency E +#' @importFrom Matrix sparseMatrix +#' @return Returns the object with object@@snn filled +#' @export +BuildSNN <- function( + object, + genes.use = NULL, + reduction.type = "pca", + dims.use = NULL, + k.param = 10, + k.scale = 10, + plot.SNN = FALSE, + prune.SNN = 1/15, + print.output = TRUE, + distance.matrix = NULL, + force.recalc = FALSE +) { + if (! is.null(x = distance.matrix)) { + data.use <- distance.matrix + } else if (is.null(x = genes.use) && is.null(x = dims.use)) { + genes.use <- object@var.genes + data.use <- t(x = as.matrix(x = object@data[genes.use, ])) + } else if (! is.null(x = dims.use)) { + data.use <- GetCellEmbeddings(object, reduction.type = reduction.type, + dims.use = dims.use) + } else if (!is.null(genes.use) && is.null(dims.use)) { + data.use <- t(x = as.matrix(x = object@data[genes.use, ])) + } else { + stop("Data error!") + } + parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("BuildSNN"))] + if (CalcInfoExists(object, "BuildSNN")){ + parameters.to.store$object <- NULL + old.parameters <- GetAllCalcParam(object, "BuildSNN") + old.parameters$time <- NULL + if(all(old.parameters %in% parameters.to.store)){ + warning("Build parameters exactly match those of already computed and stored SNN. To force recalculation, set force.recalc to TRUE.") + return(object) + } + } + object <- SetCalcParams(object = object, + calculation = "BuildSNN", + ... = parameters.to.store) + n.cells <- nrow(x = data.use) + if (n.cells < k.param) { + warning("k.param set larger than number of cells. 
Setting k.param to number of cells - 1.") + k.param <- n.cells - 1 + } + # find the k-nearest neighbors for each single cell + if (is.null(x = distance.matrix)) { + my.knn <- get.knn( + data <- as.matrix(x = data.use), + k = min(k.scale * k.param, n.cells - 1) + ) + nn.ranked <- cbind(1:n.cells, my.knn$nn.index[, 1:(k.param-1)]) + nn.large <- my.knn$nn.index + } else { + warning("Building SNN based on a provided distance matrix") + n <- nrow(x = distance.matrix) + k.for.nn <- k.param * k.scale + knn.mat <- matrix(data = 0, ncol = k.for.nn, nrow = n) + knd.mat <- knn.mat + for (i in 1:n){ + knn.mat[i, ] <- order(data.use[i, ])[1:k.for.nn] + knd.mat[i, ] <- data.use[i, knn.mat[i, ]] + } + nn.large <- knn.mat + nn.ranked <- knn.mat[, 2:k.param] + } + w <- CalcSNNSparse( + cell.names = object@cell.names, + k.param = k.param, + nn.large = nn.large, + nn.ranked = nn.ranked, + prune.SNN = prune.SNN, + print.output = print.output + ) + object@snn <- w + if (plot.SNN) { + if (length(x = object@dr$tsne@cell.embeddings) < 1) { + warning("Please compute a tSNE for SNN visualization. See RunTSNE().") + } else { + net <- graph.adjacency( + adjmatrix = w, + mode = "undirected", + weighted = TRUE, + diag = FALSE + ) + plot.igraph( + x = net, + layout = as.matrix(x = object@dr$tsne@cell.embeddings), + edge.width = E(graph = net)$weight, + vertex.label = NA, + vertex.size = 0 + ) + } + } + return(object) +} + +# Function to convert the knn graph into the snn graph. Stored in a sparse +# representation. + +# @param cell.names A vector of cell names which will correspond to the row/ +# column names of the SNN +# @param k.param Defines nearest neighborhood when computing NN graph +# @param nn.large Full KNN graph (computed with get.knn with k set to +# k.param * k.scale) +# @param nn.ranked Subset of full KNN graph that only contains the first +# k.param nearest neighbors. 
Used to define Jaccard +# distances between any two cells +# @param prune.snn Sets the cutoff for acceptable Jaccard distances when +# computing the neighborhood overlap for the SNN +# construction. Any edges with values less than or equal to +# this will be set to 0 and removed from the SNN graph. +# Essentially sets the stringency of pruning (0 --- no +# pruning, 1 --- prune everything). +# @param print.output Whether or not to print output to the console +# @return Returns an adjacency matrix representation of the SNN +# graph + +CalcSNNSparse <- function( + cell.names, + k.param, + nn.large, + nn.ranked, + prune.SNN, + print.output +) { + n.cells <- length(cell.names) + counter <- 1 + idx1 <- vector(mode = "integer", length = n.cells ^ 2 / k.param) + idx2 <- vector(mode = "integer", length = n.cells ^ 2 / k.param) + edge.weight <- vector(mode = "double", length = n.cells ^ 2 / k.param) + id <- 1 + # fill out the adjacency matrix w with edge weights only between your target + # cell and its k.scale*k.param-nearest neighbors + # speed things up (don't have to calculate all pairwise distances) + # define the edge weights with Jaccard distance + if (print.output) { + print("Constructing SNN") + pb <- txtProgressBar(min = 0, max = n.cells, style = 3) + } + for (i in 1:n.cells) { + for (j in 1:ncol(x = nn.large)) { + s <- intersect(x = nn.ranked[i, ], y = nn.ranked[nn.large[i, j], ]) + u <- union(nn.ranked[i, ], nn.ranked[nn.large[i, j], ]) + e <- length(x = s) / length(x = u) + if (e > prune.SNN) { + idx1[id] <- i + idx2[id] <- nn.large[i, j] + edge.weight[id] <- e + id <- id + 1 + } + } + if (print.output) { + setTxtProgressBar(pb = pb, value = i) + } + } + if (print.output) { + close(con = pb) + } + idx1 <- idx1[! is.na(x = idx1) & idx1 != 0] + idx2 <- idx2[! is.na(x = idx2) & idx2 != 0] + edge.weight <- edge.weight[! 
is.na(x = edge.weight) & edge.weight != 0] + w <- sparseMatrix( + i = idx1, + j = idx2, + x = edge.weight, + dims = c(n.cells, n.cells) + ) + diag(x = w) <- 1 + rownames(x = w) <- cell.names + colnames(x = w) <- cell.names + return(w) +} + +# This function calculates the pairwise connectivity of clusters. + +# @param object Seurat object containing the snn graph and cluster assignments +# @return matrix with all pairwise connectivities calculated + +CalcConnectivity <- function(object) { + SNN <- object@snn + cluster.names <- unique(x = object@ident) + num.clusters <- length(x = cluster.names) + connectivity <- matrix(data = 0, nrow = num.clusters, ncol = num.clusters) + rownames(x = connectivity) <- cluster.names + colnames(x = connectivity) <- cluster.names + n <- 1 + for (i in cluster.names) { + for (j in cluster.names[-(1:n)]) { + subSNN <- SNN[ + match(x = WhichCells(object = object, ident = i), colnames(x = SNN)), + match(x = WhichCells(object = object, ident = j), rownames(x = SNN)) + ] + if (is.object(x = subSNN)) { + connectivity[i, j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN)) + } else { + connectivity[i, j] <- mean(x = subSNN) + } + } + n <- n + 1 + } + return(connectivity) +} diff --git a/R/spatial.R b/R/spatial.R new file mode 100644 index 000000000..dadb81920 --- /dev/null +++ b/R/spatial.R @@ -0,0 +1,404 @@ +#' Get cell centroids +#' +#' Calculate the spatial mapping centroids for each cell, based on previously +#' calculated mapping probabilities for each bin. +#' +#' Currently, Seurat assumes that the tissue of interest has an 8x8 bin +#' structure. This will be broadened in a future release. +#' +#' @param object Seurat object +#' @param cells.use Cells to calculate centroids for (default is all cells) +#' @param get.exact Get exact centroid (Default is TRUE). If FALSE, identify +#' the single closest bin. +#' +#' @return Data frame containing the x and y coordinates for each cell +#' centroid. 
+#' +#' @export +#' +GetCentroids <- function(object, cells.use = NULL, get.exact = TRUE) { + cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@spatial@finalprob)) + #Error checking + cell.names <- intersect(x = cells.use, y = colnames(x = object@spatial@finalprob)) + if (length(x = cell.names) != length(x = cells.use)) { + print(paste( + "Error", + setdiff(x = cells.use, y = colnames(x = object@spatial@finalprob)), + " have not been mapped" + )) + return(0) + } + if (get.exact) { + my.centroids <- data.frame(t(x = sapply( + X = colnames(x = object@data), + FUN = function(x) { + return(ExactCellCentroid(cell.probs = object@spatial@finalprob[, x])) + } + ))) + } else { + my.centroids <- data.frame(t(x = sapply( + X = colnames(x = object@data), + FUN = function(x) { + return(CellCentroid(cell.probs = object@spatial@finalprob[, x])) + } + ))) + } + colnames(x = my.centroids) <- c("bin.x","bin.y") + return(my.centroids) +} + +#' Quantitative refinement of spatial inferences +#' +#' Refines the initial mapping with more complex models that allow gene +#' expression to vary quantitatively across bins (instead of 'on' or 'off'), +#' and that also considers the covariance structure between genes. +#' +#' Full details given in spatial mapping manuscript. +#' +#' @param object Seurat object +#' @param genes.use Genes to use to drive the refinement procedure. 
+#' +#' @return Seurat object, where mapping probabilities for each bin are stored +#' in object@@final.prob +#' +#' @import fpc +#' +#' @export +#' +RefinedMapping <- function(object, genes.use) { + genes.use <- intersect(x = genes.use, y = rownames(x = object@imputed)) + cells.max <- t(x = sapply( + X = colnames(object@data), + FUN = function(x) { + return(ExactCellCentroid(object@spatial@finalprob[, x])) + } + )) + all.mu <- sapply( + X = genes.use, + FUN = function(gene) { + return(sapply(X = 1:64, FUN = function(bin) { + mean(x = as.numeric(x = object@imputed[ + gene, # Row + FetchClosest( + bin = bin, + all.centroids = cells.max, + num.cell = 2*length(x = genes.use) + ) # Column + ])) + })) + } + ) + all.cov <- list() + for (x in 1:64) { + all.cov[[x]] <- cov( + x = t( + x = object@imputed[ + genes.use, # Row + FetchClosest( + bin = x, + all.centroids = cells.max, + num.cell = 2*length(x = genes.use) + ) # Columns + ] + ) + ) + } + mv.probs <- sapply( + X = colnames(x = object@data), + FUN = function(my.cell) { + return(sapply(X = 1:64, FUN = function(bin) { + return(slimdmvnorm( + x = as.numeric(x = object@imputed[genes.use, my.cell]), + mean = as.numeric(x = all.mu[bin, genes.use]), + sigma = all.cov[[bin]]) + ) + })) + } + ) + mv.final <- exp( + x = sweep( + x = mv.probs, + MARGIN = 2, + STATS = apply(X = mv.probs, MARGIN = 2, FUN = LogAdd) + ) + ) + object@spatial@finalprob <- data.frame(mv.final) + return(object) +} + +#' Infer spatial origins for single cells +#' +#' Probabilistically maps single cells based on (imputed) gene expression +#' estimates, a set of mixture models, and an in situ spatial reference map. 
+#' +#' @param object Seurat object +#' @param cells.use Which cells to map +#' +#' @return Seurat object, where mapping probabilities for each bin are stored +#' in object@@final.prob +#' +#' @export +#' +InitialMapping <- function(object, cells.use = NULL) { + cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data)) + every.prob <- sapply( + X = cells.use, + FUN = function(x) { + return(MapCell( + object = object, + cell.name = x, + do.plot = FALSE, + safe.use = FALSE + )) + } + ) + object@spatial@finalprob <- data.frame(every.prob) + rownames(x = object@spatial@finalprob) <- paste0("bin.", rownames(x = object@spatial@finalprob)) + return(object) +} + +#return cell centroid after spatial mappings (both X and Y) +#' @export +CellCentroid <- function(cell.probs) { + centroid.x <- XCellCentroid(cell.probs = cell.probs) + centroid.y <- YCellCentroid(cell.probs = cell.probs) + centroid.bin <- 8 * (centroid.y - 1) + centroid.x + return(centroid.bin) +} + +#return x-coordinate cell centroid +#' @export +XCellCentroid <- function(cell.probs) { + centroid.x <- round(x = sum(sapply( + X = 1:64, + FUN = function(x) { + return((x - 1) %% 8 + 1) + } + ) * cell.probs)) + return(centroid.x) +} + +#return y-coordinate cell centroid +#' @export +YCellCentroid <- function(cell.probs) { + centroid.y <- round(x = sum(sapply( + X = 1:64, + FUN = function(x) { + return((x - 1) %/% 8 + 1) + } + ) * cell.probs)) + return(centroid.y) +} + +#return x and y-coordinate cell centroid +#' @export +ExactCellCentroid <- function(cell.probs) { + # centroid.x=(sum(sapply(1:64,function(x)(x-1)%%8+1)*cell.probs)) + centroid.x <- XCellCentroid(cell.probs = cell.probs) + # centroid.y=(sum(sapply(1:64,function(x)(x-1)%/%8+1)*cell.probs)) + centroid.y <- YCellCentroid(cell.probs = cell.probs) + return(c(centroid.x, centroid.y)) +} + +#' Build mixture models of gene expression +#' +#' Models the imputed gene expression values as a mixture of gaussian +#' distributions. 
For a two-state model, estimates the probability that a given +#' cell is in the 'on' or 'off' state for any gene. Followed by a greedy +#' k-means step where cells are allowed to flip states based on the overall +#' structure of the data (see Manuscript for details) +#' +#' @param object Seurat object +#' @param gene Gene to fit +#' @param do.k Number of modes for the mixture model (default is 2) +#' @param num.iter Number of 'greedy k-means' iterations (default is 1) +#' @param do.plot Plot mixture model results +#' @param genes.use Genes to use in the greedy k-means step (See manuscript for details) +#' @param start.pct Initial estimates of the percentage of cells in the 'on' +#' state (usually estimated from the in situ map) +#' +#' @return A Seurat object, where the posterior of each cell being in the 'on' +#' or 'off' state for each gene is stored in object@@spatial@@mix.probs +#' +#' @importFrom mixtools normalmixEM +#' +#' @export +#' +FitGeneK <- function( + object, + gene, + do.k = 2, + num.iter = 1, + do.plot = FALSE, + genes.use = NULL, + start.pct = NULL +) { + data <- object@imputed + data.use <- data[gene, ] + names(x = data.use) <- colnames(x = data.use) + scale.data <- t(x = scale(x = t(x = object@imputed))) + genes.use <- SetIfNull(x = genes.use, default = rownames(x = scale.data)) + genes.use <- genes.use[genes.use %in% rownames(x = scale.data)] + scale.data <- scale.data[genes.use, ] + data.cut <- as.numeric(x = data.use[gene, ]) + cell.ident <- as.numeric(x = cut(x = data.cut, breaks = do.k)) + if (! 
(is.null(x = start.pct))) { + cell.ident <- rep.int(x = 1, times = length(x = data.cut)) + cell.ident[data.cut > quantile(x = data.cut, probs = 1 - start.pct)] <- 2 + } + cell.ident <- order(tapply( + X = as.numeric(x = data.use), + INDEX = cell.ident, + FUN = mean + ))[cell.ident] + ident.table <- table(cell.ident) + if (num.iter > 0) { + for (i2 in 1:num.iter) { + cell.ident <- iter.k.fit( + scale.data = scale.data, + cell.ident = cell.ident, + data.use = data.use + ) + ident.table <- table(cell.ident) + } + } + ident.table <- table(cell.ident) + raw.probs <- t( + x = sapply( + X = data.use, + FUN = function(y) { + return(unlist( + x = lapply( + X = 1:do.k, + FUN = function(x) { + return( + (ident.table[x] / sum(ident.table)) * dnorm( + x = y, + mean = mean(x = as.numeric(x = data.use[cell.ident == x])), + sd = sd(x = as.numeric(x = data.use[cell.ident == x])) + ) + ) + } + ) + )) + } + ) + ) + norm.probs <- raw.probs / apply(X = raw.probs, MARGIN = 1, FUN = sum) + colnames(x = norm.probs) <- unlist( + x = lapply( + X = 1:do.k, + FUN = function(x) { + paste(gene, x - 1, "post", sep=".") + } + ) + ) + norm.probs <- cbind(norm.probs, cell.ident) + colnames(x = norm.probs)[ncol(x = norm.probs)] <- paste0(gene, ".ident") + new.mix.probs <- data.frame( + SubsetColumn( + data = object@spatial@mix.probs, + code = paste0(gene, "."), + invert = TRUE + ), + row.names = rownames(x = object@spatial@mix.probs) + ) + colnames(x = new.mix.probs)[1] <- "nGene" + object@spatial@mix.probs <- cbind(new.mix.probs, norm.probs) + if (do.plot) { + nCol <- 2 + num.row <- floor(x = (do.k + 1) / nCol - (1e-5)) + 1 + hist( + x = as.numeric(x = data.use), + probability = TRUE, + ylim = c(0, 1), + xlab = gene, + main = gene + ) + for (i in 1:do.k) { + lines( + x = seq(from = -10, to = 10, by = 0.01), + y = (ident.table[i] / sum(ident.table)) * dnorm( + x = seq(from = -10, to = 10, by = 0.01), + mean = mean(x = as.numeric(x = data.use[cell.ident == i])), + sd = sd(x = as.numeric(x = 
data.use[cell.ident == i])) + ), + col=i, + lwd=2 + ) + } + } + return(object) +} + +# Documentation +############### +#Internal, not documented for now +#' @export +FitGeneMix <- function( + object, + gene, + do.k = 3, + use.mixtools = TRUE, + do.plot = FALSE, + plot.with.imputed = TRUE, + min.bin.size = 10 +) { + data.fit <- as.numeric(x = object@imputed[gene, ]) + mixtools.fit <- normalmixEM(x = data.fit, k = do.k) + comp.order <- order(mixtools.fit$mu) + mixtools.posterior <- data.frame(mixtools.fit$posterior[, comp.order]) + colnames(x = mixtools.posterior) <- unlist( + x = lapply( + X = 1:do.k, + FUN = function(x) { + return(paste(gene, x - 1, "post", sep=".")) + } + ) + ) + #mixtools.mu=data.frame(mixtools.fit$mu[comp.order]) + #mixtools.sigma=data.frame(mixtools.fit$sigma[comp.order]) + #mixtools.alpha=data.frame(mixtools.fit$lambda[comp.order]) + #rownames(mixtools.mu)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"mu",sep="."))) + #rownames(mixtools.sigma)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"sigma",sep="."))) + #rownames(mixtools.alpha)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"alpha",sep="."))) + #object@mix.mu = rbind(minusr(object@mix.mu,gene), mixtools.mu); + #object@mix.sigma = rbind(minusr(object@mix.sigma,gene), mixtools.sigma); + #o#bject@mu.alpha =rbind(minusr(object@mu.alpha,gene), mixtools.alpha); + if (do.plot) { + nCol <- 2 + num.row <- floor(x = (do.k + 1) / nCol - (1e-5)) + 1 + par(mfrow = c(num.row, nCol)) + plot.mixEM(x = mixtools.fit, which = 2) + plot.data <- as.numeric(x = object@imputed[gene, ]) + if (! 
plot.with.imputed) { + plot.data <- as.numeric(x = object@data[gene, ]) + } + unlist( + x = lapply( + X = 1:do.k, + FUN = function(x) { + plot( + x = plot.data, + y = mixtools.posterior[, x], + ylab = paste0("Posterior for Component ", x - 1), + xlab = gene, + main = gene + ) + } + ) + ) + } + new.mix.probs <- data.frame( + SubsetColumn( + data = object@spatial@mix.probs, + code = paste0(gene, "."), + invert = TRUE + ), + row.names = rownames(x = object@spatial@mix.probs) + ) + colnames(x = new.mix.probs)[1] <- "nGene" + object@spatial@mix.probs <- cbind(new.mix.probs, mixtools.posterior) + return(object) +} + diff --git a/R/spatial_internal.R b/R/spatial_internal.R new file mode 100644 index 000000000..0aa51fdd7 --- /dev/null +++ b/R/spatial_internal.R @@ -0,0 +1,308 @@ +#internal function for spatial mapping +ShiftCell <- function(bin, x, y) { + bin.y <- (bin - 1) %/% 8 + 1 + bin.x <- (bin - 1) %% 8 + 1 + new.x <- MinMax(data = bin.x + x, min = 1, max = 8) + new.y <- MinMax(data = bin.y + y, min = 1, max = 8) + new.bin <- 8 * (new.y - 1) + new.x + return(new.bin) +} + +NeighborCells <- function(bin) { + return(unique(x = c( + bin, + ShiftCell(bin = bin, x = 0, y = 1), + ShiftCell(bin = bin, x = 1, y = 0), + ShiftCell(bin = bin, x = -1, y = 0), + ShiftCell(bin = bin, x = 0, y = -1) + ))) +} + +AllNeighborCells <- function(bin, dist = 1) { + all.comb <- expand.grid(rep(x = list(-dist:dist), 2)) + return(unique(x = unlist(x = lapply( + X = 1:nrow(x = all.comb), + FUN = function(x) { + return(ShiftCell(bin = bin, x = all.comb[x, 1], y = all.comb[x, 2])) + })))) +} + +#FetchClosest bin, used internally in spatial mapping +FetchClosest <- function(bin, all.centroids, num.cell) { + bin.y <- (bin - 1) %/% 8 + 1 + bin.x <- (bin - 1) %% 8 + 1 + all.centroids <- rbind(all.centroids, c(bin.x, bin.y)) + all.dist <- as.matrix(x = dist(x = all.centroids)) + return(names(x = sort(x = all.dist[nrow(x = all.dist), ]))[2:(num.cell + 2)]) +} + +#calculate refined mapping 
probabilites based on multivariate distribution +slimdmvnorm <- function (x, mean = rep(0, p), sigma = diag(p), log = FALSE) { + x <- matrix(data = x, ncol = length(x = x)) + p <- ncol(x = x) + dec <- tryCatch(chol(x = sigma), error = function(e) e) + tmp <- backsolve(r = dec, t(x = x) - mean, transpose = TRUE) + rss <- colSums(tmp ^ 2) + logretval <- -sum(log(x = diag(x = dec))) - 0.5 * p * log(x = 2 * pi) - 0.5 * rss + names(x = logretval) <- rownames(x = x) + return(logretval) +} + + +# Documentation +############### +#Internal, not documented for now +CalcInsitu <- function( + object, + gene, + do.plot = TRUE, + do.write = FALSE, + write.dir = "~/window/insitu/", + col.use = bwCols, + do.norm = FALSE, + cells.use = NULL, + do.return = FALSE, + probs.min = 0, + do.log = FALSE, + use.imputed = FALSE, + bleach.use = 0 +) { + cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@spatial@final.prob)) + probs.use <- object@spatial@final.prob + if (use.imputed) { + data.use <- exp(x = object@imputed) - 1 + } else { + data.use <- exp(object@data) - 1 + } + cells.use <- cells.use[cells.use %in% colnames(x = probs.use)] + cells.use <- cells.use[cells.use %in% colnames(x = data.use)] + #insilico.stain=matrix(unlist(lapply(1:64,function(x) sum(probs.use[x,]*data.use[gene,]))),nrow=8,ncol=8) + insilico.vector <- unlist( + x = lapply( + X = 1:64, + FUN = function(x) { + return(sum( + as.numeric(x = probs.use[x, cells.use]) * + as.numeric(x = data.use[gene, cells.use]) + )) + } + ) + ) + probs.total <- apply(X = probs.use, MARGIN = 1, FUN = sum) + probs.total[probs.total < probs.min] <- probs.min + insilico.stain <- (matrix(data = insilico.vector / probs.total, nrow=8, ncol=8)) + if (do.log) { + insilico.stain <- log(x = insilico.stain + 1) + } + if (bleach.use > 0) { + insilico.stain <- insilico.stain - bleach.use + insilico.stain <- MinMax(data = insilico.stain, min=0, max=1e6) + } + if (do.norm) { + insilico.stain <- (insilico.stain - min(insilico.stain)) / + 
(max(insilico.stain) - min(insilico.stain)) + } + title.use <- gene + if (gene %in% colnames(x = object@spatial@insitu.matrix)) { + pred.use <- prediction( + predictions = insilico.vector / probs.total, + labels = object@spatial@insitu.matrix[, gene], + label.ordering = 0:1 + ) + perf.use <- performance(prediction.obj = pred.use, measure = "auc") + auc.use <- round(x = perf.use@y.values[[1]], digits = 3) + title.use <- paste(gene, sep=" ") + } + if (do.write) { + write.table( + x = insilico.stain, + file = paste0(write.dir, gene, ".txt"), + quote=FALSE, + row.names=FALSE, + col.names=FALSE + ) + } + if (do.plot) { + aheatmap( + x = insilico.stain, + Rowv = NA, + Colv = NA, + col = col.use, + main=title.use + ) + } + if (do.return) { + return(as.vector(x = insilico.stain)) + } + return(object) +} + + + +# Documentation +############### +#Not documented for now +#' @export +PosteriorPlot <- function(object, name) { + post.names <- colnames(x = SubsetColumn(data = object@spatial@mix.probs, code = name)) + VlnPlot( + object = object, + features.plot = post.names, + inc.first=TRUE, + inc.final=TRUE, + by.k=TRUE + ) +} + +# Documentation +############### +#Internal, not documented for now +map.cell.score <- function(gene, gene.value, insitu.bin, mu, sigma, alpha) { + code.1 <- paste(gene, insitu.bin, sep=".") + mu.use <- mu[paste(code.1, "mu", sep="."), 1] + sigma.use <- sigma[paste(code.1, "sigma", sep="."), 1] + alpha.use <- alpha[paste(code.1, "alpha", sep="."), 1] + bin.prob <- unlist( + x = lapply( + X = 1:length(x = insitu.bin), + FUN = function(x) { + return(dnorm( + x = gene.value, + mean = mu.use[x], + sd = sigma.use[x], + log = TRUE) + log(x = alpha.use[x]) + ) + } + ) + ) + return(bin.prob) +} + +#Internal, not documented for now +#' @export +#' +MapCell <- function( + object, + cell.name, + do.plot = FALSE, + safe.use = TRUE, + text.val = NULL, + do.rev = FALSE +) { + insitu.matrix <- object@spatial@insitu.matrix + insitu.genes <- colnames(x = insitu.matrix) 
+ insitu.genes <- insitu.genes[insitu.genes %in% rownames(x = object@imputed)] + insitu.use <- insitu.matrix[, insitu.genes] + imputed.use <- object@imputed[insitu.genes, ] + if (safe.use) { + safe_fxn <- LogAdd + } else { + safe_fxn <- sum + } + all.needed.cols <- unique( + x = unlist( + x = lapply( + X = insitu.genes, + FUN = function(x) { + return(paste(x, insitu.use[, x], "post", sep=".")) + } + ) + ) + ) + missing.cols <- which(! (all.needed.cols %in% colnames(x = object@spatial@mix.probs))) + if (length(x = missing.cols) > 0) { + stop(paste( + "Error: ", + all.needed.cols[missing.cols], + "is missing from the mixture fits" + )) + } + all.probs <- data.frame( + sapply( + X = insitu.genes, + FUN = function(x) { + return( + log(x = as.numeric(x = object@spatial@mix.probs[ + cell.name, # Row + paste(x, insitu.use[, x], "post", sep=".") # Column + ]))) + } + ) + ) + scale.probs <- t( + x = t(x = all.probs) - apply(X = t(x = all.probs), MARGIN = 1, FUN = LogAdd) + ) + scale.probs[scale.probs < (-9.2)] <- (-9.2) + #head(scale.probs) + total.prob <- exp(x = apply(X = scale.probs, MARGIN = 1, FUN = safe_fxn)) + total.prob <- total.prob / sum(total.prob) + if (do.plot) { + #plot(total.prob,main=cell.name) + par(mfrow = c(1, 2)) + txt.matrix <- matrix(data = rep(x = "", 64), nrow=8, ncol=8) + if (! 
is.null(x = text.val)) { + txt.matrix[text.val] <- "X" + } + if (do.rev) { + scale.probs <- scale.probs[unlist( + x = lapply( + X = 0:7, + FUN = function(x) { + return(seq(from = 1, to = 57, by = 8) + x) + } + ) + ), ] + } + aheatmap( + x = matrix(data = total.prob, nrow=8, ncol=8), + Rowv = NA, + Colv = NA, + txt = txt.matrix, + col = bwCols + ) + aheatmap(x = scale.probs, Rowv = NA, Colv = NA) + ResetPar() + } + return(total.prob) +} + +# Set up class to hold spatial info +spatial.info <- setClass( + Class = "spatial.info", + slots = list( + mix.probs = "data.frame", + mix.param = "data.frame", + final.prob = "data.frame", + insitu.matrix = "data.frame" + ) +) + +#Internal, not documented for now +iter.k.fit <- function(scale.data, cell.ident, data.use) { + means.all <- sapply( + X = sort(x = unique(x = cell.ident)), + FUN = function(x) { + return(apply(X = scale.data[, cell.ident == x], MARGIN = 1, FUN = mean)) + } + ) + all.dist <- data.frame( + t(x = sapply( + X = 1:ncol(x = scale.data), + FUN = function(x) { + return(unlist(x = lapply( + X = sort(x = unique(x = cell.ident)), + FUN = function(y) { + return(dist(x = rbind(scale.data[, x], means.all[, y]))) + } + ))) + } + )) + ) + cell.ident <- apply(X = all.dist, MARGIN = 1, FUN = which.min) + cell.ident <- order(tapply( + X = as.numeric(x = data.use), + INDEX = cell.ident, + FUN = mean + ))[cell.ident] + return(cell.ident) +} diff --git a/R/tSNE_project.R b/R/tSNE_project.R new file mode 100644 index 000000000..c8e878bdb --- /dev/null +++ b/R/tSNE_project.R @@ -0,0 +1,111 @@ + +project_map <- function( + z, + x_old, + sum_X_old, + x_old_tsne, + P_tol = 5e-6, + perplexity = 30 +) { + sum_z <- sum(z ^ 2) + #x_old=test2@pca.rot[,1:5] + #sum_X_old=rowSums((x_old^2)) + D_org <- sum_z + (-2 * as.matrix(x = x_old) %*% t(x = z) + sum_X_old) + P <- d2p_cell(D = D_org, u = perplexity) + nn_points <- which(x = P> P_tol) # Use only a small subset of points to comupute embedding. 
This keeps all the points that are proximal to the new point + X_nn_set <- x_old[nn_points, ] #Original points + y_nn_set <- x_old_tsne[nn_points, ] #Computed embeddings + P_nn_set <- P[nn_points, ] #Probabilities + y_new0 <- ( + t(x = as.matrix(x = y_nn_set)) %*% + t(x = as.matrix(x = rbind(P_nn_set, P_nn_set))) + )[, 1] #Initial guess of point as a weighted average + sink("/dev/null") + y_new <- optim( + par = y_new0, + fn = KullbackFun, + gr = NULL, + y_nn_set, + P_nn_set, + method = "Nelder-Mead" + ) + sink() + #plot(test2@tsne.rot) + #points(y_new$par[1],y_new$par[2],col="red",cex=1,pch=16) + #points(test2@tsne.rot[cell.num,1],test2@tsne.rot[cell.num,2],col="blue",cex=1,pch=16) + #return(dist(as.matrix(rbind(y_new$par,test2@tsne.rot[cell.num,])))) + return(y_new$par) +} + +d2p_cell <- function(D, u = 15, tol = 1e-4) { + betamin = -Inf + betamax = Inf + tries = 0 + tol = 1e-4 + beta = 1 + beta.list <- Hbeta(D = D, beta = beta) + h <- beta.list[[1]] + thisP <- beta.list[[2]] + flagP <- beta.list[[3]] + hdiff <- h - log(x = u) + while (abs(x = hdiff) > tol && tries < 50) { + if (hdiff > 0) { + betamin <- beta + if (betamax == Inf) { + beta <- beta * 2 + } else { + beta <- (beta + betamax) / 2 + } + } else { + betamax <- beta + if (betamin == -Inf) { + beta <- beta / 2 + } else { + beta <- (beta + betamin) / 2 + } + } + beta.list <- Hbeta(D = D, beta = beta) + h <- beta.list[[1]] + thisP <- beta.list[[2]] + flagP <- beta.list[[3]] + hdiff <- h - log(x = u) + tries <- tries + 1 + } + # set the final row of p + P <- thisP + #Check if there are at least 10 points that are highly similar to the projected point + return(P) +} + +KullbackFun <- function(z, y, P) { + #Computes the Kullback-Leibler divergence cost function for the embedding x in a tSNE map + #%P = params{1}; %Transition probabilities in the original space. Nx1 vector + #%y = params{2}; %tSNE embeddings of the training set. 
Nx2 vector + print(z) + print(dim(x = y)) + Cost0 = sum(P * log(x = P)) #Constant part of the cost function + #Compute pairwise distances in embedding space + sum_z <- sum(z ^ 2) + sum_y <- rowSums(x = (y ^ 2)) + D_yz <- sum_z +( + -2 * as.matrix(x = y) %*% t(x = matrix(data = z, nrow = 1)) + sum_y + ) + Q <- 1 / (1 + D_yz) + Q <- Q / sum(Q) #Transition probabilities in the embedded space + Cost <- Cost0 - sum(P * log(x = Q)) #% - 100 * sum(Q .* log(Q)); + return(Cost) +} + +Hbeta <- function(D, beta) { + flagP <- 1 + P <- exp(x = -D * beta) + sumP <- sum(P) + if (sumP < 1e-8) { #In this case it means that no point is proximal. + P <- ones(length(x = P), 1) / length(x = P) + sumP <- sum(P) + flagP <- 0 + } + H <- log(x = sumP) + beta * sum(D * P) / sumP + P <- P / sumP + return(list(H, P, flagP)) +} diff --git a/R/utilities.R b/R/utilities.R new file mode 100644 index 000000000..938467c63 --- /dev/null +++ b/R/utilities.R @@ -0,0 +1,714 @@ +#' Shuffle a vector +#' @param x A vector +#' @return A vector with the same values of x, just in random order +#' @export +#' +Shuffle <- function(x) { + return(x[base::sample.int( + n = base::length(x = x), + size = base::length(x = x), + replace = FALSE + )]) +} + +#' Remove data from a table +#' +#' This function will remove any rows from a data frame or matrix +#' that contain certain values +#' +#' @param to.remove A vector of values that indicate removal +#' @param data A data frame or matrix +#' +#' @return A data frame or matrix with values removed by row + +#' @export +#' +RemoveFromTable <- function(to.remove, data) { + remove.indecies <- apply( + X = data, + MARGIN = 2, + FUN = function(col) { + return(which(x = col %in% to.remove)) + } + ) + remove.indecies <- unlist(x = remove.indecies) + remove.indecies <- as.numeric(x = remove.indecies) + if (length(x = remove.indecies) == 0) { + return(data) + } else { + return(data[-remove.indecies, ]) + } +} + +#' Make object sparse +#' +#' Converts stored data matrices to 
sparse matrices to save space. Converts +#' object@@raw.data and object@@data to sparse matrices. +#' @param object Seurat object +#' @return Returns a seurat object with data converted to sparse matrices. +#' @import Matrix +#' @export +#' +MakeSparse <- function(object) { + if (class(object@raw.data) == "data.frame") { + object@raw.data <- as.matrix(x = object@raw.data) + } + if (class(object@data) == "data.frame") { + object@data <- as.matrix(x = object@data) + } + object@raw.data <- as(object = object@raw.data, Class = "dgCMatrix") + object@data <- as(object = object@data, Class = "dgCMatrix") + return(object) +} + +#' Update old Seurat object to accommodate new features +#' +#' Updates Seurat objects to new structure for storing data/calculations. +#' +#' @param object Seurat object +#' +#' @return Returns a Seurat object compatible with latest changes +#' +#' @export +#' +UpdateSeuratObject <- function(object) { + if (.hasSlot(object, "version")) { + if(packageVersion("Seurat") >= package_version("2.0.0")){ + cat("Object representation is consistent with the most current Seurat version.") + return(object) + } + } + seurat.version <- packageVersion("Seurat") + new.object <- new("seurat", + raw.data = object@raw.data, + version = seurat.version) + new.slots <- slotNames(new.object) + for(s in new.slots){ + new.object <- FillSlot(slot.name = s, old.object = object, + new.object = new.object) + } + + # Copy over old slots if they have info stored + if(length(object@kmeans.obj) > 0){ + new.object@kmeans@gene.kmeans.obj <- object@kmeans.obj + } + if(length(object@kmeans.col) >0 ){ + new.object@kmeans@cell.kmeans.obj <- object@kmeans.col + } + if(length(object@data.info) > 0){ + new.object@meta.data <- object@data.info + } + if(length(object@mean.var) > 0){ + new.object@hvg.info <- object@mean.var + colnames(new.object@hvg.info) <- c("gene.mean", "gene.dispersion", "gene.dispersion.scaled") + } + if(length(object@mix.probs) > 0 | length(object@mix.param) > 0 | + 
length(object@final.prob) > 0 | length(object@insitu.matrix) > 0) { + new.object@spatial <- new("spatial.info", + mix.probs = object@mix.probs, + mix.param = object@mix.param, + final.prob = object@final.prob, + insitu.matrix = object@insitu.matrix) + } + + # Conversion from development versions prior to 2.0.0 + if ((.hasSlot(object, "dr"))) { + for (i in 1:length(object@dr)) { + new.object@dr[[i]]@cell.embeddings <- object@dr[[i]]@rotation + new.object@dr[[i]]@gene.loadings <- object@dr[[i]]@x + new.object@dr[[i]]@gene.loadings.full <- object@dr[[i]]@x.full + new.object@dr[[i]]@sdev <- object@dr[[i]]@sdev + new.object@dr[[i]]@key <- object@dr[[i]]@key + new.object@dr[[i]]@misc <- object@dr[[i]]@misc + } + # Conversion from release versions prior to 2.0.0 + # Slots to replace: pca.x, pca.rot, pca.x.full, tsne.rot, ica.rot, ica.x, + # tsne.rot + else{ + pca.sdev <- object@pca.obj[[1]]$sdev + if (is.null(x = pca.sdev)) { + pca.sdev <- object@pca.obj[[1]]$d + } + pca.obj <- new( + Class = "dim.reduction", + gene.loadings = as.matrix(object@pca.x), + gene.loadings.full = as.matrix(object@pca.x.full), + cell.embeddings = as.matrix(object@pca.rot), + sdev = pca.sdev, + key = "PC" + ) + new.object@dr$pca <- pca.obj + ica.obj <- new( + Class = "dim.reduction", + gene.loadings = as.matrix(object@ica.x), + cell.embeddings = as.matrix(object@ica.rot), + key = "IC" + ) + new.object@dr$ica <- ica.obj + tsne.obj <- new( + Class = "dim.reduction", + cell.embeddings = as.matrix(object@tsne.rot), + key = "tSNE_" + ) + new.object@dr$tsne <- tsne.obj + } + + if (length(x = object@snn.sparse) == 1 && length(x = object@snn.dense) > 1) { + if (class(object@snn.dense) == "data.frame") { + object@snn.dense <- as.matrix(x = object@snn.dense) + } + new.object@snn <- as(object = object@snn.dense, Class = "dgCMatrix") + } + else{ + new.object@snn <- object@snn.sparse + } + + return(new.object) +} + +#' Return a subset of rows for a matrix or data frame +#' +#' @param data Matrix or data frame 
#' with row names
#' @param code Pattern for matching within row names
#' @param invert Invert the search?
#'
#' @return Returns a subset of data. If invert = TRUE, returns data where rownames
#' do not contain code, otherwise returns data where rownames contain code
#'
#' @export
#'
SubsetRow <- function(data, code, invert = FALSE) {
  # Keep (or, with invert = TRUE, drop) the rows whose names match `code`.
  rows.keep <- grep(pattern = code, x = rownames(x = data), invert = invert)
  return(data[rows.keep, ])
}

#' Independently shuffle values within each row of a matrix
#'
#' Creates a matrix where correlation structure has been removed, but overall values are the same
#'
#' @param x Matrix to shuffle
#' @return Returns a scrambled matrix, where each row is shuffled independently
#' @export
MatrixRowShuffle <- function(x) {
  # Work on the transpose so a column-major ordering trick can be used:
  # sorting by (original row index, random tie-break) permutes values
  # independently within each row of x.
  flipped <- t(x = x)
  random.order <- order(c(col(x = flipped)), runif(n = length(x = flipped)))
  return(matrix(
    data = flipped[random.order],
    nrow = nrow(x = x),
    ncol = ncol(x = x),
    byrow = TRUE
  ))
}

#' Return a subset of columns for a matrix or data frame
#'
#' @param data Matrix or data frame with column names
#' @param code Pattern for matching within column names
#' @param invert Invert the search?
#'
#' @return Returns a subset of data. If invert = TRUE, returns data where colnames
#' do not contain code, otherwise returns data where colnames contain code
#'
#' @export
#'
SubsetColumn <- function(data, code, invert = FALSE) {
  # Keep (or, with invert = TRUE, drop) the columns whose names match `code`.
  cols.keep <- grep(pattern = code, x = colnames(x = data), invert = invert)
  return(data[, cols.keep])
}

#' Apply a ceiling and floor to all values in a matrix
#'
#' @param data Matrix or data frame
#' @param min all values below this min value will be replaced with min
#' @param max all values above this max value will be replaced with max
#' @return Returns matrix after performing these floor and ceil operations
#' @export
MinMax <- function(data, min, max) {
  clipped <- data
  clipped[clipped > max] <- max
  clipped[clipped < min] <- min
  return(clipped)
}

#' Extract delimiter information from a string.
#'
#' Parses a string (usually a cell name) and extracts fields based on a delimiter
#'
#' @param string String to parse.
#' @param field Integer(s) indicating which field(s) to extract. Can be a vector multiple numbers.
#' @param delim Delimiter to use, set to underscore by default.
#'
#' @return A new string, that parses out the requested fields, and (if multiple), rejoins them with the same delimiter
#' @export
ExtractField <- function(string, field = 1, delim = "_") {
  # Accept either a numeric vector of field indices or a comma-separated
  # string such as "1,3".
  fields <- as.numeric(x = unlist(x = strsplit(
    x = as.character(x = field),
    split = ","
  )))
  if (length(x = fields) == 1) {
    # FIX: index with the parsed numeric `fields`, not the raw `field` --
    # a character field (e.g. "2") used as an index silently returns NA.
    return(strsplit(x = string, split = delim)[[1]][fields])
  }
  return(paste(
    strsplit(x = string, split = delim)[[1]][fields],
    collapse = delim
  ))
}

#' Calculate the variance of logged values
#'
#' Calculate variance of logged values in non-log space (return answer in
#' log-space)
#'
#' @param x value or vector of values
#'
#' @return Returns the variance in log-space
#' @export
ExpVar <- function(x) {
  return(log1p(var(expm1(x))))
}

#' Calculate the standard deviation of logged values
#'
#' Calculate SD of logged values in non-log space (return answer in log-space)
#'
#' @param x value or vector of values
#'
#' @return Returns the standard deviation in log-space
#' @export
ExpSD <- function(x) {
  return(log1p(sd(expm1(x))))
}

#' Calculate the mean of logged values
#'
#' Calculate mean of logged values in non-log space (return answer in log-space)
#'
#' @param x value or vector of values
#'
#' @return Returns the mean in log-space
#' @export
ExpMean <- function(x) {
  # log1p/expm1 for numerical stability and consistency with ExpVar/ExpSD.
  return(log1p(x = mean(x = expm1(x = x))))
}

#' Calculate the variance to mean ratio of logged values
#'
#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
#' log-space)
#'
#' @param x value or vector of values
#'
#' @return Returns the VMR in log-space
#' @export
LogVMR <- function(x) {
  return(log(x = var(x = expm1(x = x)) / mean(x = expm1(x = x))))
}

#' Run a custom distance function on an input data matrix
#'
#' @author Jean Fan
#'
#' @param my.mat A matrix to calculate distance on
#' @param my.function A function to calculate distance
#' @param ... Extra parameters to my.function
#'
#' @return A distance matrix
#'
#' @export
#'
CustomDistance <- function(my.mat, my.function, ...) {
  # Pairwise distances between the COLUMNS of my.mat.
  n <- ncol(x = my.mat)
  mat <- matrix(data = 0, ncol = n, nrow = n)
  colnames(x = mat) <- rownames(x = mat) <- colnames(x = my.mat)
  # seq_len() is safe for zero-column input (1:n would yield c(1, 0)).
  for (i in seq_len(length.out = nrow(x = mat))) {
    for (j in seq_len(length.out = ncol(x = mat))) {
      mat[i, j] <- my.function(my.mat[, i], my.mat[, j], ...)
    }
  }
  return(as.dist(mat))
}

#' Probability of detection by identity class
#'
#' For each gene, calculates the probability of detection for each identity
#' class.
#'
#' @param object Seurat object
#' @param thresh.min Minimum threshold to define 'detected' (log-scale)
#'
#' @return Returns a matrix with genes as rows, identity classes as columns.
#'
#' @export
#'
AverageDetectionRate <- function(object, thresh.min = 0) {
  ident.use <- object@ident
  data.all <- data.frame(row.names = rownames(x = object@data))
  for (i in sort(x = unique(x = ident.use))) {
    temp.cells <- WhichCells(object = object, ident = i)
    # Fraction of cells in this identity class expressing above thresh.min.
    data.temp <- apply(
      X = object@data[, temp.cells],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min) / length(x = x))
      }
    )
    data.all <- cbind(data.all, data.temp)
    colnames(x = data.all)[ncol(x = data.all)] <- i
  }
  colnames(x = data.all) <- sort(x = unique(x = ident.use))
  return(data.all)
}

#' Average PCA scores by identity class
#'
#' Returns the PCA scores for an 'average' single cell in each identity class
#'
#' @param object Seurat object
#'
#' @return Returns a matrix with genes as rows, identity classes as columns
#'
#' @export
#'
AveragePCA <- function(object) {
  ident.use <- object@ident
  data.all <- data.frame(row.names = colnames(x = object@pca.rot))
  for (i in levels(x = ident.use)) {
    temp.cells <- WhichCells(object = object, ident = i)
    if (length(x = temp.cells) == 1) {
      # Duplicate the single cell so apply() still sees a two-row data frame
      # and returns a vector of per-PC means.
      data.temp <- apply(
        X = data.frame((object@pca.rot[c(temp.cells, temp.cells), ])),
        MARGIN = 2,
        FUN = mean
      )
    } else if (length(x = temp.cells) > 1) {
      data.temp <- apply(
        X = object@pca.rot[temp.cells, ],
        MARGIN = 2,
        FUN = mean
      )
    }
    data.all <- cbind(data.all, data.temp)
    colnames(x = data.all)[ncol(x = data.all)] <- i
  }
  return(data.all)
}

#' Averaged gene expression by identity class
#'
#' Returns gene expression for an 'average' single cell in each identity class
#'
#' Output is in log-space, but averaging is done in non-log space.
#'
#' @param object Seurat object
#' @param genes.use Genes to analyze. Default is all genes.
#' @param return.seurat Whether to return the data as a Seurat object. Default is false.
#' @param add.ident Place an additional label on each cell prior to averaging (very useful if you want to observe cluster averages, separated by replicate, for example).
#' @param use.scale Use scaled values for gene expression
#' @param use.raw Use raw values for gene expression
#' @inheritParams FetchData
#' @param show.progress Show progress bar (default is T)
#' @param ... Arguments to be passed to methods such as \code{\link{Seurat}}
#' @return Returns a matrix with genes as rows, identity classes as columns.
#' @export
AverageExpression <- function(
  object,
  genes.use = NULL,
  return.seurat = FALSE,
  add.ident = NULL,
  use.scale = FALSE,
  use.raw = FALSE,
  show.progress = TRUE,
  ...
) {
  ident.orig <- object@ident
  orig.levels <- levels(x = object@ident)
  ident.new <- c()
  if (!is.null(x = add.ident)) {
    # Refine the identity classes by appending the extra label (e.g.
    # replicate) fetched for each cell.
    new.data <- FetchData(object = object, vars.all = add.ident)
    new.ident <- paste(
      object@ident[rownames(x = new.data)],
      new.data[, 1],
      sep = '_'
    )
    object <- SetIdent(
      object = object,
      cells.use = rownames(x = new.data),
      ident.use = new.ident
    )
  }
  if (return.seurat) {
    assays.use <- c("RNA", names(x = object@assay))
  } else {
    assays.use <- "RNA"
  }
  # Choose which data slot to average and how: log-space data is averaged
  # in non-log space via ExpMean; scaled/raw data with a plain mean.
  slot.use <- "data"
  fxn.average <- ExpMean
  if (use.scale) {
    slot.use <- "scale.data"
    fxn.average <- mean
  }
  if (use.raw) {
    slot.use <- "raw.data"
    fxn.average <- mean
  }
  data.return <- list()
  for (i in seq_along(along.with = assays.use)) {
    data.use <- GetAssayData(
      object = object,
      assay.type = assays.use[i],
      slot = slot.use
    )
    genes.assay <- genes.use
    # Fall back to all genes when none of the requested genes are present.
    if (length(x = intersect(x = genes.use, y = rownames(x = data.use))) < 1) {
      genes.assay <- rownames(x = data.use)
    }
    data.all <- data.frame(row.names = genes.assay)
    for (j in levels(x = object@ident)) {
      temp.cells <- WhichCells(object = object, ident = j)
      genes.assay <- unique(x = intersect(x = genes.assay, y = rownames(x = data.use)))
      if (length(x = temp.cells) == 1) {
        data.temp <- (data.use[genes.assay, temp.cells])
      }
      if (length(x = temp.cells) > 1) {
        data.temp <- apply(
          X = data.use[genes.assay, temp.cells],
          MARGIN = 1,
          FUN = fxn.average
        )
      }
      data.all <- cbind(data.all, data.temp)
      colnames(x = data.all)[ncol(x = data.all)] <- j
      if (show.progress) {
        print(paste0("Finished averaging ", assays.use[i], " for cluster ", j))
      }
      if (i == 1) {
        # Remember the original identity of each new (averaged) "cell".
        ident.new <- c(ident.new, as.character(x = ident.orig[temp.cells[1]]))
      }
    }
    names(x = ident.new) <- levels(x = object@ident)
    data.return[[i]] <- data.all
    names(x = data.return)[i] <- assays.use[[i]]
  }
  if (return.seurat) {
    toRet <- CreateSeuratObject(
      raw.data = data.return[[1]],
      project = "Average",
      min.cells = 0,
      min.genes = 0,
      is.expr = 0,
      ...
    )
    # For multimodal data, attach the remaining assays to the new object.
    if (length(x = data.return) > 1) {
      for (i in 2:length(x = data.return)) {
        toRet <- SetAssayData(
          object = toRet,
          assay.type = names(x = data.return)[i],
          slot = "raw.data",
          new.data = data.return[[i]]
        )
      }
    }
    toRet <- SetIdent(
      object = toRet,
      cells.use = toRet@cell.names,
      ident.use = ident.new[toRet@cell.names]
    )
    toRet@ident <- factor(
      x = toRet@ident,
      levels = as.character(x = orig.levels),
      ordered = TRUE
    )
    return(toRet)
  } else {
    return(data.return[[1]])
  }
}


#' Merge childen of a node
#'
#' Merge the childen of a node into a single identity class
#'
#' @param object Seurat object
#' @param node.use Merge children of this node
#'
#' @export
#'
MergeNode <- function(object, node.use = NULL) {
  object.tree <- object@cluster.tree[[1]]
  node.children <- DFT(
    tree = object.tree,
    node = node.use,
    include.children = TRUE
  )
  node.children <- intersect(x = node.children, y = levels(x = object@ident))
  children.cells <- WhichCells(object = object, ident = node.children)
  # FIX: the original tested length(children.cells > 0) -- the length of a
  # logical vector -- instead of comparing the length itself to zero.
  if (length(x = children.cells) > 0) {
    object <- SetIdent(
      object = object,
      cells.use = children.cells,
      ident.use = min(node.children)
    )
  }
  return(object)
}


#' Calculate smoothed expression values
#'
#' Smooths expression values across the k-nearest neighbors based on dimensional reduction
#'
#' @inheritParams FeaturePlot
#' @inheritParams AddImputedScore
#' @param genes.fit Genes to calculate smoothed values for
#' @param k k-param for k-nearest neighbor calculation
#' @param do.log Whether to perform smoothing in log space. Default is false.
#'
#' @importFrom FNN get.knn
#'
#' @export
#'
AddSmoothedScore <- function(
  object,
  genes.fit = NULL,
  dim.1 = 1,
  dim.2 = 2,
  reduction.use = "tSNE",
  k = 30,
  do.log = FALSE,
  do.print = FALSE
) {
  # Default to the object's variable genes; keep only genes present in @data.
  genes.fit <- SetIfNull(x = genes.fit, default = object@var.genes)
  genes.fit <- genes.fit[genes.fit %in% rownames(x = object@data)]
  # Resolve the axis names (e.g. "tSNE_1"/"tSNE_2") of the chosen reduction
  # and fetch the 2D embedding coordinates for every cell.
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, c(dim.1, dim.2))
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  # k-nearest-neighbor indices of every cell in the 2D embedding.
  knn.smooth <- get.knn(data = data.plot, k = k)$nn.index
  # NOTE(review): when do.log = FALSE (the default) the neighborhood average
  # uses ExpMean (mean in non-log space, returned in log space); when
  # do.log = TRUE a plain mean of the logged values is used.
  avg.fxn <- mean
  if (! do.log) {
    avg.fxn <- ExpMean
  }
  # For each gene, replace every cell's value with the average over that
  # cell's k nearest neighbors. Result: rows = genes, columns = cells.
  lasso.fits <- data.frame(
    t(
      x = sapply(
        X = genes.fit,
        FUN = function(g) {
          return(unlist(
            x = lapply(
              X = 1:nrow(x = data.plot),
              FUN = function(y) {
                avg.fxn(as.numeric(x = object@data[g, knn.smooth[y, ]]))
              }
            )
          ))
        }
      )
    )
  )
  colnames(x = lasso.fits) <- rownames(x = data.plot)
  # Overwrite rows of @imputed for genes smoothed before; append new genes.
  genes.old <- genes.fit[genes.fit %in% rownames(x = object@imputed)]
  genes.new <- genes.fit[! (genes.fit %in% rownames(x = object@imputed))]
  if (length(x = genes.old) > 0) {
    object@imputed[genes.old, ] <- lasso.fits[genes.old, ]
  }
  object@imputed <- rbind(object@imputed, lasso.fits[genes.new, ])
  return(object)
}

#' Calculate imputed expression values
#'
#' Uses L1-constrained linear models (LASSO) to impute single cell gene
#' expression values.
#'
#'
#' @param object Seurat object
#' @param genes.use A vector of genes (predictors) that can be used for
#' building the LASSO models.
#' @param genes.fit A vector of genes to impute values for
#' @param s.use Maximum number of steps taken by the algorithm (lower values
#' indicate a greater degree of smoothing)
#' @param do.print Print progress (output the name of each gene after it has
#' been imputed).
#' @param gram The use.gram argument passed to lars
#'
#' @return Returns a Seurat object where the imputed values have been added to
#' object@@imputed
#'
#' @import lars
#'
#' @export
#'
AddImputedScore <- function(
  object,
  genes.use = NULL,
  genes.fit = NULL,
  s.use = 20,
  do.print = FALSE,
  gram = TRUE
) {
  # Predictors (genes.use) and targets (genes.fit) both default to the
  # variable genes; restrict both to genes actually present in @data.
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  genes.fit <- SetIfNull(x = genes.fit, default = object@var.genes)
  genes.use <- genes.use[genes.use %in% rownames(x = object@data)]
  genes.fit <- genes.fit[genes.fit %in% rownames(x = object@data)]
  lasso.input <- t(x = object@data[genes.use, ])
  # Fit one LASSO model per target gene, excluding the target itself from
  # its own predictor set. Result: rows = genes, columns = cells.
  # NOTE(review): `lasso.input` computed above is not used here -- each call
  # rebuilds its own predictor matrix; presumably a leftover precomputation.
  lasso.fits <- data.frame(t(
    x = sapply(
      X = genes.fit,
      FUN = function(x) {
        return(
          lasso.fxn(
            lasso.input = t(x = object@data[genes.use[genes.use != x], ]),
            genes.obs = object@data[x, ],
            s.use = s.use,
            gene.name = x,
            do.print = do.print,
            gram = gram
          )
        )
      }
    )
  ))
  # Overwrite rows of @imputed for genes imputed before; append new genes.
  genes.old <- genes.fit[genes.fit %in% rownames(x = object@imputed)]
  genes.new <- genes.fit[! (genes.fit %in% rownames(x = object@imputed))]
  if (length(x = genes.old) > 0) {
    object@imputed[genes.old, ] <- lasso.fits[genes.old, ]
  }
  object@imputed <- rbind(object@imputed, lasso.fits[genes.new, ])
  return(object)
}

#' GenesInCluster
#'
#' After k-means analysis, previously run with DoKMeans, returns a set of genes associated with each cluster
#'
#' @param object Seurat object.
Assumes DoKMeans has already been run +#' @param cluster.num K-means cluster(s) to return genes for +#' @param max.genes max number of genes to return +#' @return A vector of genes who are members in the cluster.num k-means cluster(s) +#' +#' @export +GenesInCluster <- function(object, cluster.num, max.genes = 1e6) { + toReturn <- unlist( + x = lapply( + X = cluster.num, + FUN = function(x) { + return(head( + x = sort(x = names(x = which(x = object@kmeans@gene.kmeans.obj$cluster==x))), + n = max.genes + )) + } + ) + ) + return(toReturn) +} + + diff --git a/R/utilities_internal.R b/R/utilities_internal.R new file mode 100644 index 000000000..86ef20ea5 --- /dev/null +++ b/R/utilities_internal.R @@ -0,0 +1,563 @@ +# Internal function for merging two matrices by rowname +# +# @param mat1 First matrix +# @param mat2 Second matrix +# +# @return A merged matrix +# +RowMergeSparseMatrices <- function(mat1, mat2){ + if (inherits(x = mat1, what = "data.frame")) { + mat1 <- as.matrix(x = mat1) + } + if (inherits(x = mat2, what = "data.frame")) { + mat2 <- as.matrix(x = mat2) + } + mat1 <- as(object = mat1, Class = "RsparseMatrix") + mat2 <- as(object = mat2, Class = "RsparseMatrix") + mat1.names <- rownames(x = mat1) + mat2.names <- rownames(x = mat2) + all.names <- union(x = mat1.names, y = mat2.names) + new.mat <- RowMergeMatrices( + mat1 = mat1, + mat2 = mat2, + mat1_rownames = mat1.names, + mat2_rownames = mat2.names, + all_rownames = all.names + ) + rownames(x = new.mat) <- make.unique(names = all.names) + colnames(x = new.mat) <- make.unique(names = c( + colnames(x = mat1), + colnames(x = mat2) + )) + return(new.mat) +} + +# Calculate the percentage of a vector above some threshold +# +# @param x Vector of values +# @param threshold Threshold to use when calculating percentage +# +# @return Returns the percentage of `x` values above the given +# threshold +# +PercentAbove <- function(x, threshold){ + return(length(x = x[x > threshold]) / length(x = x)) +} + +# Calculate 
position along a defined reference range for a given vector of +# numerics. Will range from 0 to 1. +# +# @param x Vector of numeric type +# @param lower Lower end of reference range +# @param upper Upper end of reference range +# +# @return Returns a vector that describes the position of each element in +# x along the defined reference range + +ReferenceRange <- function(x, lower = 0.025, upper = 0.975) { + return((x - quantile(x = x, probs = lower)) / + (quantile(x = x, probs = upper) - quantile(x = x, probs = lower))) +} + +# Function to map values in a vector `v` as defined in `from`` to the values +# defined in `to`. +# +# @param v vector of values to map +# @param from vector of original values +# @param to vector of values to map original values to (should be of equal +# length as from) +# @return returns vector of mapped values +# +MapVals <- function(v, from, to){ + if (length(from) != length(to)) { + stop("from and to vectors are not the equal length.") + } + vals.to.match <- match(v, from) + vals.to.match.idx <- !is.na(vals.to.match) + v[vals.to.match.idx] <- to[vals.to.match[vals.to.match.idx]] + return(v) +} + +# Fills slot in new object with equivalent slot in old object if it still exists +# +# @param slot.name slot to fill +# @param old.object object to get slot value from +# @param new.slot object to set slot value in +# +# @return returns new object with slot filled +# +FillSlot <- function(slot.name, old.object, new.object){ + new.slot <- tryCatch( + { + slot(object = old.object, name = slot.name) + }, + error = function(err){ + return(NULL) + } + ) + if(!is.null(x = new.slot)) { + slot(new.object, slot.name) <- new.slot + } + return(new.object) +} + + +# Use Fisher's method (Fisher's combined probability test) to combine p-values +# into single statistic +# +# @param pvals vector of p-values +# +# @returns integrated value +# +FisherIntegrate <- function(pvals) { + return(1 - pchisq(q = -2 * sum(log(x = pvals)), df = 2 * length(x = pvals))) +} + 
# Set CalcParam information
#
# Records the parameters used for a named calculation in the object's
# calc.params slot.
#
# @param object A Seurat object
# @param calculation The name of the calculation that was done
# @param time store time of calculation as well
# @param ... Parameters for the calculation (callers pass a single list,
#            which the single-bracket assignment stores under `calculation`)
#
# @return object with the calc.param slot modified to either append this
# calculation or replace the previous instance of calculation with
# a new list of parameters
#
SetCalcParams <- function(object, calculation, time = TRUE, ...) {
  object@calc.params[calculation] <- list(...)
  # Never store the (potentially large) input objects themselves.
  object@calc.params[[calculation]]$object <- NULL
  object@calc.params[[calculation]]$object2 <- NULL
  if (time) {
    object@calc.params[[calculation]]$time <- Sys.time()
  }
  return(object)
}

# Delete CalcParam information
#
# @param object A Seurat object
# @param calculation The name of the calculation to remove
#
# @return object with the calc.param slot modified to remove this
# calculation
#
RemoveCalcParams <- function(object, calculation){
  object@calc.params[calculation] <- NULL
  return(object)
}

# Set Single CalcParam information
#
# @param object A Seurat object
# @param calculation The name of the calculation that was done
# @param parameter Parameter for the calculation to set
# @param value Value of parameter to set
#
# @return object with the calc.param slot modified to either append this
# calculation or replace the previous instance of calculation with
# a new list of parameters
#
SetSingleCalcParam <- function(object, calculation, parameter, value) {
  object@calc.params[[calculation]][parameter] <- value
  return(object)
}

# Get CalcParam information
#
# @param object A Seurat object
# @param calculation The name of the calculation that was done
# @param parameter Parameter for the calculation to pull
#
# @return parameter value for given calculation
#
GetCalcParam <- function(object, calculation, parameter){
  if (parameter == "time") {
    # Extract with [[ ]] so the POSIXct class of the timestamp survives
    # (unlist() would strip it).
    return(object@calc.params[[calculation]][parameter][[1]])
  }
  return(unname(unlist(object@calc.params[[calculation]][parameter])))
}

# Get All CalcParam information for given calculation
#
# @param object A Seurat object
# @param calculation The name of the calculation that was done
#
# @return list of parameter values for given calculation
#
GetAllCalcParam <- function(object, calculation){
  return(object@calc.params[[calculation]])
}

# Has any info been stored for the given calculation?
#
# @param object A Seurat object
# @param calculation The name of the calculation to look for info about
#
# @return Returns a boolean - whether or not there is any info about given calc
# stored
#
CalcInfoExists <- function(object, calculation){
  return(!is.null(object@calc.params[[calculation]]))
}

# Return vector of whitespace
#
# @param n length of whitespace vector to return (values <= 0 yield a
#          single space)
#
# @return vector of whitespace
#
FillWhiteSpace <- function(n){
  if (n <= 0) {
    n <- 1
  }
  return(paste0(rep(" ", n), collapse = ""))
}

####################### Tree Related Utilities #################################

# Function to get all the descendants on a tree left of a given node
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
#
# @return Returns all descendant tips (node ids <= Nnode + 1) left of the node
#
GetLeftDescendants <- function(tree, node) {
  daughters <- tree$edge[which(tree$edge[, 1] == node), 2]
  # A tip id is at most Nnode + 1 in ape's numbering.
  if (daughters[1] <= (tree$Nnode + 1)) {
    return(daughters[1])
  }
  daughter.use <- GetDescendants(tree, daughters[1])
  daughter.use <- daughter.use[daughter.use <= (tree$Nnode + 1)]
  return(daughter.use)
}

# Function to get all the descendants on a tree right of a given node
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
#
# @return Returns all descendant tips (node ids <= Nnode + 1) right of the node
#
GetRightDescendants <- function(tree, node) {
  daughters <- tree$edge[which(x = tree$edge[, 1] == node), 2]
  if (daughters[2] <= (tree$Nnode + 1)) {
    return(daughters[2])
  }
  daughter.use <- GetDescendants(tree = tree, node = daughters[2])
  daughter.use <- daughter.use[daughter.use <= (tree$Nnode + 1)]
  return(daughter.use)
}

# Function to get all the descendants on a tree of a given node
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
# @param curr Accumulator used during recursion
#
# @return Returns all descendants of the given node
#
GetDescendants <- function(tree, node, curr = NULL) {
  if (is.null(x = curr)) {
    curr <- vector()
  }
  daughters <- tree$edge[which(x = tree$edge[, 1] == node), 2]
  curr <- c(curr, daughters)
  # FIX: reference tip.label explicitly instead of relying on $ partial
  # matching of tree$tip; any id >= number of tips is an internal node.
  w <- which(x = daughters >= length(x = tree$tip.label))
  if (length(x = w) > 0) {
    for (i in seq_along(along.with = w)) {
      curr <- GetDescendants(tree = tree, node = daughters[w[i]], curr = curr)
    }
  }
  return(curr)
}

# Depth first traversal path of a given tree
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
# @param path Path through the tree (for recursion)
# @param include.children Include children in the output path
# @param only.children Only include children in the output path
# @return Returns a vector representing the depth first
# traversal path
#
DFT <- function(
  tree,
  node,
  path = NULL,
  include.children = FALSE,
  only.children = FALSE
) {
  # only.children implies include.children.
  if (only.children) {
    include.children <- TRUE
  }
  children <- which(x = tree$edge[, 1] == node)
  child1 <- tree$edge[children[1], 2]
  child2 <- tree$edge[children[2], 2]
  # A child that appears in the parent column is itself internal: record it
  # (unless only leaf children are requested) and recurse; otherwise it is a
  # leaf and is recorded only when include.children is set.
  if (child1 %in% tree$edge[, 1]) {
    if (!only.children) {
      path <- c(path, child1)
    }
    path <- DFT(
      tree = tree,
      node = child1,
      path = path,
      include.children = include.children,
      only.children = only.children
    )
  } else {
    if (include.children) {
      path <- c(path, child1)
    }
  }
  if (child2 %in% tree$edge[, 1]) {
    if (!only.children) {
      path <- c(path, child2)
    }
    path <- DFT(
      tree = tree,
      node = child2,
      path = path,
      include.children = include.children,
      only.children = only.children
    )
  } else {
    if (include.children) {
      path <- c(path, child2)
    }
  }
  return(path)
}

# Function to check whether a given node in a tree has a child (leaf node)
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
#
# @return Returns a Boolean of whether the given node is connected to a
# terminal leaf node

NodeHasChild <- function(tree, node) {
  children <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
  # FIX: use elementwise & -- the original scalar && only inspected the
  # first child (and errors on vectors in R >= 4.3).
  return(any(children %in% tree$edge[, 2] & ! children %in% tree$edge[, 1]))
}

# Function to check whether a given node in a tree has only children(leaf nodes)
#
# @param tree Tree object (from ape package)
# @param node Internal node in the tree
#
# @return Returns a Boolean of whether the given node is connected to only
# terminal leaf nodes

NodeHasOnlyChildren <- function(tree, node) {
  children <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
  # If no child appears in the parent column, all children are leaves.
  return(!any(children %in% tree$edge[, 1]))
}

# Function to return all internal (non-terminal) nodes in a given tree
#
# @param tree Tree object (from ape package)
#
# @return Returns a vector of all internal nodes for the given tree
#
GetAllInternalNodes <- function(tree) {
  # The root (first parent in the edge matrix) plus every internal node
  # reached by a depth-first traversal.
  return(c(tree$edge[1, 1], DFT(tree = tree, node = tree$edge[1, 1])))
}
################################################################################

# Weighted Euclidean Distance
#
# NOTE(review): despite the name, this computes sum(sqrt(w) * |x - y|)
# (a weighted L1-style distance), not sqrt(sum(w * (x - y)^2)). Behavior
# preserved as-is since callers may depend on it.
#
# @param x Dataset 1
# @param y Dataset 2
# @param w Weights
#
# @return The Weighted Euclidian Distance (numeric)
#
WeightedEuclideanDistance <- function(x, y, w) {
  v.dist <- sum(sqrt(x = w * (x - y) ^ 2))
  return(v.dist)
}

# Set a default value if an object is null
#
# @param x An object to set if it's null
# @param default The value to provide if x is null
#
# @return default if x is null, else x
#
SetIfNull <- function(x, default) {
  if (is.null(x = x)) {
    return(default)
  } else {
    return(x)
  }
}

# return average of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The mean of x where x > min
#
MeanGreaterThan <- function(x, min = 0) {
  return(mean(x = x[x > min]))
}

# return variance of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The variance of x where x > min
#
VarianceGreaterThan <- function(x, min = 0) {
  return(var(x = x[x > min]))
}

# calculate the coefficient of variation
#
# @param x Values to calculate the coefficient of variation
#
# @return The coefficient of variation of x
#
CoefVar <- function(x) {
  return(sd(x = x) / mean(x = x))
}

# return the count of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The number of elements of x where x > min
#
CountGreaterThan <- function(x, min = 0) {
  return(sum(x > min))
}

# add values in log-space
#
# @param x Values (in log space)
#
# @return log(sum(exp(x))), computed with the max-subtraction trick to
# avoid overflow
#
LogAdd <- function(x) {
  mpi <- max(x)
  return(mpi + log(x = sum(exp(x = x - mpi))))
}

# Return what was passed
#
# @param x anything
#
# @return Returns x
#
Same <- function(x) {
  return(x)
}

# Pearson residuals from a negative-binomial regression, with progressively
# simpler fallbacks when the fit fails.
#
# @param fmla Model formula
# @param regression.mat Data frame with the model variables (expects a
#        'GENE' column for the final fallback)
# @param gene Gene name, used only in failure messages
#
# @return Pearson residuals of the fitted model, or scaled log10 counts if
# both fits fail
#
NBResiduals <- function(fmla, regression.mat, gene) {
  fit <- 0
  try(expr = fit <- glm.nb(formula = fmla, data = regression.mat), silent = TRUE)
  # `fit` is still the numeric sentinel when glm.nb failed.
  if (is.numeric(x = fit)) {
    message(sprintf(
      'glm.nb failed for gene %s; trying again with glm and family=negative.binomial(theta=0.1)',
      gene
    ))
    try(
      expr = fit <- glm(
        formula = fmla,
        data = regression.mat,
        family = negative.binomial(theta = 0.1)
      ),
      silent = TRUE
    )
    if (is.numeric(x = fit)) {
      message('glm and family=negative.binomial(theta=0.1) failed; falling back to scale(log10(y+1))')
      return(scale(x = log10(x = regression.mat[, 'GENE'] + 1))[, 1])
    }
  }
  return(residuals(object = fit, type = 'pearson'))
}


# Documentation
###############
# Internal, not documented for now.
# Fit a LASSO model of genes.obs on lasso.input and return the fitted values.
lasso.fxn <- function(
  lasso.input,
  genes.obs,
  s.use = 20,
  gene.name = NULL,
  do.print = FALSE,
  gram = TRUE
) {
  lasso.model <- lars(
    x = lasso.input,
    y = as.numeric(x = genes.obs),
    type = "lasso",
    max.steps = s.use * 2,
    use.Gram = gram
  )
  lasso.fits <- predict.lars(
    object = lasso.model,
    newx = lasso.input,
    type = "fit",
    s = s.use
  )$fit
  if (do.print) {
    print(gene.name)
  }
  return(lasso.fits)
}

# Calculate the biweight midcorrelation (bicor) of two vectors using the
# implementation described in Langfelder, J Stat Softw. 2012. If MAD of one of
# the two vectors is 0, falls back on robust standardization.
+# +# @author Patrick Roelli +# @param x First vector +# @param y Second vector +# +# @return returns the biweight midcorrelation of x and y +# +BiweightMidcor <- function(x, y){ + resx <- BicorPrep(x) + resy <- BicorPrep(y) + result <- sum(resx * resy) + return(result) +} + +# bicor helper function to standardize the two vectors and perform common +# calculations. +# +# @author Patrick Roelli +# @param x Vector to prep +# @param verbose If TRUE, prints a warning when falling back on robust +# standardization when MAD(x) is 0. +# +# @return returns the prepped vector +# +BicorPrep <- function(x, verbose = FALSE){ + if(mad(x) == 0) { + if(verbose){ + warning('mad == 0, using robust standardization') + } + xat <- x - mean(x) + xab <- sqrt(sum((x - mean(x)) ^ 2)) + result <- xat / xab + return(result) + }else{ + ua <- (x - median(x)) / (9 * mad(x) * qnorm(0.75)) + i.x <- ifelse(ua <= -1 | ua >= 1, 0, 1) + wax <- ((1 - (ua^2))^2) * i.x + xat <- (x - median(x)) * wax + xab <- sqrt(sum(xat^2)) + result <- xat / xab + return(result) + } +} diff --git a/R/zfRenderSeurat.R b/R/zfRenderSeurat.R new file mode 100644 index 000000000..7f82d9ac8 --- /dev/null +++ b/R/zfRenderSeurat.R @@ -0,0 +1,669 @@ + +#' Draw 3D in situ predictions from Zebrafish dataset +#' +#' From Jeff Farrell +#' +#' @param data Predicted expression levels across Zebrafish bins +#' @param label Plot label +#' +#' @export +situ3d <- function(data, label = NULL, ...) { + # Call Seurat function to get the in situ values out. 
+ exp.1 <- data + exp.1 <- (exp.1 - min(exp.1)) / (max(exp.1) - min(exp.1)) + # Reformat them into an expression matrix as expected by the plotting function + expression.matrix <- data.frame(matrix(data = exp.1, nrow = 8, ncol = 8)) + rownames(x = expression.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = expression.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. + zf.insitu.side(expression.matrix) + par3d(windowRect = c(0, 0, 800, 800)) + # Label or not and then set the view. + if (! is.null(x = label)) { + text3d(x = 0, y = 0, z = 1.5, text = label, cex = 3) + } + view3d(zoom = .75, theta = 0, phi = -90, fov = 0) +} + + +#' @export +zf.cells.render <- function( + seuratObject, + cells.use, + do.rotate = TRUE, + label = TRUE, + calc.new = FALSE, + col.use = "red", + radius.use = 0.05, + col.prob = FALSE, + do.new = TRUE, + ... +) { + tierBins <- 30 # 1 bin per cell tier. + DVBins <- 64 # 1 bin every 5.625 degrees; compatible with our current 8-bin system. + phiPerTier <- pi / (-2 * tierBins) + thetaPerDV <- (2 * pi) / DVBins + if (length(x = col.use) == 1) { + col.use <- rep(x = col.use, length(x = cells.use)) + } + # Reformat that probability into an expression matrix as expected by the plotting function + if (col.prob) { + prob.matrix <- data.frame(matrix( + data = apply( + X = seuratObject@final.prob[, cells.use], + MARGIN = 1, + FUN = sum + ), + nrow = 8, + ncol = 8 + )) + } else { + prob.matrix <- data.frame(matrix(data = 0, nrow = 8, ncol = 8)) + } + rownames(x = prob.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = prob.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. 
+  if (do.new)
+  {
+    # Draw the mirrored embryo shell before overlaying the cell spheres.
+    zf.insitu.side(
+      expressionMatrix = prob.matrix,
+      mirror = TRUE,
+      nonmirror = FALSE
+    )
+  }
+  i <- 1
+  for(cell.use in cells.use) {
+    #add the centroid
+    anchor.centroid <- ExactCellCentroid(
+      cell.probs = seuratObject@final.prob[, cell.use]
+    )
+    # Lower cell-tier boundary of each of the 8 tier bins (plus terminal 0);
+    # diff() yields each bin's (negative) span for the interpolation below.
+    tiers.min <- c(30, 24, 16, 12, 8, 6, 4, 2, 0)
+    tiers.size <- diff(x = tiers.min)
+    # Map the 8-bin DV centroid coordinate onto the 64-bin DV circle.
+    anchor.dorsality <- DVBins - ((anchor.centroid[2] - 1) / 7) * DVBins / 2
+    anchor.tier.bin <- anchor.centroid[1]
+    # NOTE(review): the next line is an exact duplicate of the assignment
+    # above and is redundant.
+    anchor.tier.bin <- anchor.centroid[1]
+    # Interpolate a fractional tier within the bin the centroid falls into.
+    anchor.tier.floor <- floor(x = anchor.tier.bin)
+    anchor.tier.left <- anchor.tier.bin - anchor.tier.floor
+    anchor.tier <- tiers.min[anchor.tier.bin] +
+      (tiers.size[anchor.tier.floor] * anchor.tier.left)
+    # Spherical (dorsality, tier) -> Cartesian point on the unit-sphere embryo.
+    x1 <- cos(x = pi - thetaPerDV * anchor.dorsality) *
+      sin(x = 0.5 * pi + phiPerTier * anchor.tier)
+    y1 <- sin(x = pi - thetaPerDV * anchor.dorsality) *
+      sin(0.5 * pi + phiPerTier * anchor.tier)
+    z1 <- cos(x = 0.5 * pi + phiPerTier * anchor.tier)
+    spheres3d(
+      x = x1,
+      y = y1,
+      z = z1,
+      radius = radius.use,
+      color = col.use[i],
+      alpha = .65,
+      lit = FALSE
+    )
+    i <- i + 1;
+  }
+  view3d(zoom = .75, theta = 0, phi = -90, fov = 0)
+  # Format the plot
+  if (do.new) {
+    if (label) {
+      # text3d(x=0, y=0, z=1.5, text=paste(this.anchor, anchor.distance),cex=3)
+      text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor")
+      text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven")
+      text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An")
+      text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg")
+    }
+    #view3d(zoom=.75, theta=0, phi=-90, fov=0) # This makes you look at dorsality 48.
+  # Spin so the final view faces the last rendered cell's dorsality.
+  to.rotate <- (anchor.dorsality - 47.5) / 64
+  if (to.rotate < 0) {
+    to.rotate <- 1 + to.rotate
+  }
+  if (do.rotate) {
+    play3d(spin3d(axis = c(0, 0, 1), rpm = 60), duration = to.rotate)
+  }
+  par3d(windowRect = c(0, 0, 800, 800))
+  }
+}
+
+#' Render an anchor cell's mapping on the 3D embryo
+#'
+#' Draws the mirrored embryo shell colored by the anchor's bin
+#' probabilities, then overlays the anchor's annotated position and its
+#' probability-weighted centroid (in later code).
+#'
+#' @param seuratObject Seurat object with @final.prob mapping probabilities
+#' @param this.anchor Name of the anchor cell to render
+#' @param anchors Data frame of anchor annotations ("dorsality", "tier")
+#' @param label If TRUE, add axis labels and the anchor name/distance
+#' @param do.rotate If TRUE, spin the view toward the anchor's dorsality
+#' @param calc.new If TRUE, recompute probabilities with project.cell
+#'   instead of reading seuratObject@final.prob
+#'
+#' @export
+zf.anchor.render <- function(
+  seuratObject,
+  this.anchor,
+  anchors,
+  label = TRUE,
+  do.rotate = TRUE,
+  calc.new = FALSE,
+  ...
+) {
+  # Determine geometry
+  tierBins <- 30 # 1 bin per cell tier.
+  DVBins <- 64 # 1 bin every 5.625 degrees; compatible with our current 8-bin system.
+  phiPerTier <- pi / (-2 * tierBins)
+  thetaPerDV <- 2 * pi / DVBins
+  cellColor <- "#EB008B"
+  centroidColor <- "green"
+  # Get probability of anchor being in each bin.
+  # BUGFIX: store the recomputed probabilities in anchor.prob.raw. They were
+  # previously assigned to anchor.prob, which is unconditionally overwritten
+  # below, so anchor.prob.raw was undefined whenever calc.new = TRUE.
+  if (calc.new) {
+    anchor.prob.raw <- data.frame(
+      prob = as.numeric(x = project.cell(
+        seuratObject,
+        this.anchor,
+        do.plot = FALSE,
+        safe = FALSE
+      ))
+    )
+  } else {
+    anchor.prob.raw <- data.frame(prob = seuratObject@final.prob[, this.anchor])
+  }
+  # Normalize so that strongest probability is 1.
+  anchor.prob <- round(x = anchor.prob.raw / max(anchor.prob.raw), digits = 5)
+  # Reformat that probability into an expression matrix as expected by the plotting function
+  prob.matrix <- data.frame(matrix(data = anchor.prob[, 1], nrow = 8, ncol = 8))
+  rownames(x = prob.matrix) <- c(
+    "24-30",
+    "17-23",
+    "13-16",
+    "9-12",
+    "7-8",
+    "5-6",
+    "3-4",
+    "1-2"
+  )
+  names(x = prob.matrix) <- c(
+    "1-4",
+    "5-8",
+    "9-12",
+    "13-16",
+    "17-20",
+    "21-24",
+    "25-28",
+    "29-32"
+  )
+  # Call the plotting function.
+  zf.insitu.side(expressionMatrix = prob.matrix, mirror = TRUE, nonmirror = FALSE)
+  # Add the anchor cell at its annotated (dorsality, tier) position.
+  anchor.dorsality <- DVBins - anchors[this.anchor, "dorsality"] * DVBins / 2
+  anchor.tier <- anchors[this.anchor, "tier"]
+  x <- cos(x = pi - thetaPerDV * anchor.dorsality) *
+    sin(x = 0.5 * pi + phiPerTier * anchor.tier)
+  y <- sin(x = pi - thetaPerDV * anchor.dorsality) *
+    sin(x = 0.5 * pi + phiPerTier * anchor.tier)
+  z <- cos(x = 0.5 * pi + phiPerTier * anchor.tier)
+  spheres3d(
+    x = x,
+    y = y,
+    z = z,
+    radius = 0.16,
+    color = cellColor,
+    alpha = .65,
+    lit = FALSE
+  )
+  # Add the probability-weighted centroid of the mapping.
+  anchor.centroid <- ExactCellCentroid(cell.probs = anchor.prob.raw)
+  # BUGFIX: use the same tier-bin boundaries as zf.cells.render; the previous
+  # 8-element c(30, 24, 16, 12, 8, 6, 2, 1) disagreed with that function and,
+  # with only 7 diff() entries, indexed past the end of tiers.size for
+  # centroids falling in the bottom bin.
+  tiers.min <- c(30, 24, 16, 12, 8, 6, 4, 2, 0)
+  tiers.size <- diff(x = tiers.min)
+  # Map the 8-bin DV centroid coordinate onto the 64-bin DV circle.
+  anchor.dorsality <- DVBins - ((anchor.centroid[2] - 1) / 7) * DVBins / 2
+  anchor.tier.bin <- anchor.centroid[1]
+  # Interpolate a fractional tier within the centroid's bin.
+  anchor.tier.floor <- floor(x = anchor.tier.bin)
+  anchor.tier.left <- anchor.tier.bin - anchor.tier.floor
+  anchor.tier <- tiers.min[anchor.tier.bin] +
+    (tiers.size[anchor.tier.floor] * anchor.tier.left)
+  x1 <- cos(x = pi - thetaPerDV * anchor.dorsality) *
+    sin(x = 0.5 * pi + phiPerTier * anchor.tier)
+  y1 <- sin(x = pi - thetaPerDV * anchor.dorsality) *
+    sin(x = 0.5 * pi + phiPerTier * anchor.tier)
+  z1 <- cos(x = 0.5 * pi + phiPerTier * anchor.tier)
+  spheres3d(
+    x = x1,
+    y = y1,
+    z = z1,
+    radius = 0.08,
+    color = centroidColor,
+    alpha = .65,
+    lit = FALSE
+  )
+  #anchor.tier.true.bin=anchors[this.anchor,"tier.bin"]
+  #anchor.tier.true <- anchors[this.anchor, "tier"]
+  #tier.min.distance=anchor.tier.true-tiers.min[anchor.tier.true.bin]
+  #if (tier.min.distance > 0) {
+  #  anchor.tier.true.bin=anchors[this.anchor,"tier.bin"]-tier.min.distance/(tiers.size[anchor.tier.true.bin])
+  #}
+  # 2D (x, y) distance between annotated position and mapped centroid.
+  anchor.distance <- round(x = dist(x = rbind(c(x, y), c(x1, y1))), digits = 2)
+  # Format the plot
+  if (label) {
+    text3d(
+      x = 0,
+      y = 0,
+      z = 1.5,
+      text = paste(this.anchor, anchor.distance),
+      cex = 3
+    )
+    text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor")
+    text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven")
+    text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An")
+    text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg")
+  }
+  view3d(zoom = .75, theta = 0, phi = -90, fov = 0) # This makes you look at dorsality 48.
+  to.rotate <- (anchor.dorsality - 47.5) / 64
+  if (to.rotate < 0) {
+    to.rotate <- 1 + to.rotate
+  }
+  if (do.rotate) {
+    play3d(spin3d(axis = c(0, 0, 1), rpm = 60), duration = to.rotate)
+  }
+  par3d(windowRect = c(0, 0, 800, 800))
+}
+
+# Plot an anchor cell's mapping probabilities as a 2D heatmap, marking the
+# annotated bin with an "X".
+#
+# @param seuratObject Seurat object with @final.prob mapping probabilities
+# @param this.anchor Name of the anchor cell to plot
+# @param anchors Data frame of anchor annotations ("dv.bin", "tier.bin")
+# @param calc.new If TRUE, recompute probabilities with project.cell
+zf.anchor.map <- function(
+  seuratObject,
+  this.anchor,
+  anchors,
+  calc.new = FALSE,
+  ...
+) {
+  # Get probability of anchor being in each bin.
+  # BUGFIX: assign the recomputed probabilities to anchor.prob.raw; they were
+  # previously stored in anchor.prob, which is overwritten below, leaving
+  # anchor.prob.raw undefined when calc.new = TRUE.
+  if (calc.new) {
+    anchor.prob.raw <- as.numeric(x = project.cell(
+      seuratObject,
+      this.anchor,
+      do.plot = FALSE,
+      safe = FALSE
+    ))
+  } else {
+    anchor.prob.raw <- seuratObject@final.prob[, this.anchor]
+  }
+  # Normalize so that strongest probability is 1.
+  anchor.prob <- round(x = anchor.prob.raw / max(anchor.prob.raw), digits = 5)
+  hm4(
+    matrix(data = as.numeric(x = anchor.prob), nrow = 8),
+    trace = "none",
+    Rowv = NA,
+    Colv = NA
+  )
+  # Mark the annotated bin; rows are drawn bottom-up, hence 9 - tier.bin.
+  text(
+    x = anchors[this.anchor, "dv.bin"],
+    y = 9 - anchors[this.anchor, "tier.bin"],
+    labels = "X",
+    cex = 1.5
+  )
+  text(x = 5, y = 1, labels = this.anchor)
+}
+
+#' @export
+zf.insitu.vec.lateral <- function(
+  expression.vector,
+  label = TRUE,
+  title = NULL,
+  ...
+) { + # Reformat them into an expression matrix as expected by the plotting function + expression.matrix <- data.frame(matrix( + data = expression.vector, + nrow = 8, + ncol = 8 + )) + rownames(x = expression.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = expression.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. + zf.insitu.side(expressionMatrix = expression.matrix, ...) + par3d(windowRect = c(0, 0, 800, 800)) + # Label or not and then set the view. + if (! is.null(x = title) & ! label) { + text3d(x = 0, y = 0, z = 1.2, text = title, cex = 4.5) + } + if (! is.null(x = title) & label) { + text3d(x = 0, y = 0, z = 1.5, text = title, cex = 3) + } + if (label) { + text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor") + text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven") + text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An") + text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg") + } + view3d(zoom = .75, theta = 0, phi = -90, fov = 0) +} + +#' @export +zf.insitu.lateral <- function(seuratObject, gene, label = TRUE, ...) { + # Call Seurat function to get the in situ values out. + expression <- CalcInsitu( + seuratObject, + gene, + do.plot = FALSE, + do.return = TRUE, + do.norm = TRUE, + ... + ) + # Reformat them into an expression matrix as expected by the plotting function + expression.matrix <- data.frame(matrix(data = expression, nrow = 8, ncol = 8)) + rownames(x = expression.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = expression.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. + zf.insitu.side(expressionMatrix = expression.matrix) + par3d(windowRect = c(0, 0, 800, 800)) + # Label or not and then set the view. 
+ text3d(x = 0, y = 0, z = 1.5, text = gene, cex = 3) + if (label) { + text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor") + text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven") + text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An") + text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg") + } + view3d(zoom = .75, theta = 0, phi = -90, fov = 0) +} + +#' @export +zf.insitu.dorsal <- function(seuratObject, gene, label=TRUE, ...) { + # Call Seurat function to get the in situ values out. + expression <- CalcInsitu( + seuratObject, + gene, + do.plot = FALSE, + do.return = TRUE, + do.norm = TRUE, + ... + ) + # Reformat them into an expression matrix as expected by the plotting function + expression.matrix <- data.frame(matrix(data = expression, nrow = 8, ncol = 8)) + rownames(x = expression.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = expression.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. + zf.insitu.side(expressionMatrix = expression.matrix) + par3d(windowRect = c(0, 0, 800, 800)) + # Label or not and then set the view. + if (label) { + text3d(x = 0, y = 0, z = 1.5, text = gene, cex = 3) + text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor") + text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven") + text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An") + text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg") + } + rotMat <- rotationMatrix(-pi / 2, 0, 0, 1) %*% rotationMatrix(-pi / 2, 0, 1, 0) + view3d(zoom = .75, userMatrix = rotMat, fov = 0) +} + +#' @export +zf.insitu.ventral <- function(seuratObject, gene, label=TRUE, ...) { + # Call Seurat function to get the in situ values out. + expression <- CalcInsitu( + seuratObject, + gene, + do.plot = FALSE, + do.return = TRUE, + do.norm = TRUE, + ... 
+ ) + # Reformat them into an expression matrix as expected by the plotting function + expression.matrix <- data.frame(matrix(data = expression, nrow = 8, ncol = 8)) + rownames(x = expression.matrix) <- c( + "24-30", + "17-23", + "13-16", + "9-12", + "7-8", + "5-6", + "3-4", + "1-2" + ) + names(x = expression.matrix) <- c( + "1-4", + "5-8", + "9-12", + "13-16", + "17-20", + "21-24", + "25-28", + "29-32" + ) + # Call the plotting function. + zf.insitu.side(expressionMatrix = expression.matrix) + par3d(windowRect = c(0, 0, 800, 800)) + # Label or not and then set the view. + if (label) { + text3d(x = 0, y = 0, z = 1.5, text = gene, cex = 3) + text3d(x = 1.4, y = 0, z = -0.3, cex = 2.25, text = "Dor") + text3d(x = -1.4, y = 0, z = -0.3, cex = 2.25, text = "Ven") + text3d(x = 0, y = 0, z = 1.2, cex = 2.25, text = "An") + text3d(x = 0, y = 0, z = -1.2, cex = 2.25, text = "Veg") + } + rotMat <- rotationMatrix(pi / 2, 0, 0, 1) %*% rotationMatrix(pi / 2, 0, 1, 0) + view3d(zoom = .75, userMatrix = rotMat, fov = 0) +} + +#' @export +zf.insitu.side <- function(expressionMatrix, nonmirror = TRUE, mirror = TRUE) { + # Determine geometry + tierBins <- 30 # 1 bin per cell tier. + DVBins <- 64 # 1 bin every 5.625 degrees; compatible with our current 8-bin system. + phiPerTier <- pi / (-2 * tierBins) + thetaPerDV <- 2 * pi / DVBins + # Determine colors + yolkColor <- "#FDF5E6" + marginColor <- "#CDC8B1" + insituPalette <- colorRampPalette( + colors = c("#FDF5E6", "#483D8B"), + space = "Lab" + ) + insituColors <- insituPalette(51) + # Make a dataframe that will hold the position of each quadrilateral for the drawing, default to yolk-colored. 
+ # Top of the embryo + drawEmbryo <- data.frame( + tier = rep(x = 1:tierBins, DVBins), + DV = rep(x = 1:DVBins, each = tierBins), + color = rep(x = yolkColor, tierBins * DVBins), + stringsAsFactors = FALSE + ) + # The yolk part + drawEmbryo <- rbind( + drawEmbryo, + data.frame( + tier = rep(x = -tierBins:-1, DVBins), + DV = rep(x = 1:DVBins, each = tierBins), + color = rep(x = yolkColor, tierBins * DVBins), + stringsAsFactors = FALSE + ) + ) + # Add the margin + drawEmbryo <- rbind( + drawEmbryo, + data.frame( + tier = rep(x = 0, DVBins), + DV = 1:DVBins, + color = rep(x = marginColor, DVBins), + stringsAsFactors = FALSE + ) + ) + # Determine the 4 coordinates for each quadrilateral defined by a bin + drawEmbryo$x1 <- cos(x = pi - thetaPerDV * drawEmbryo$DV) * + sin(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$x2 <- cos(x = pi - thetaPerDV * (drawEmbryo$DV - 1)) * + sin(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$x3 <- cos(x = pi - thetaPerDV * (drawEmbryo$DV - 1)) * + sin(x = 0.5 * pi + phiPerTier * drawEmbryo$tier) + drawEmbryo$x4 <- cos(x = pi - thetaPerDV * drawEmbryo$DV) * + sin(x = 0.5 * pi + phiPerTier * drawEmbryo$tier) + drawEmbryo$y1 <- sin(x = pi - thetaPerDV * drawEmbryo$DV) * + sin(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$y2 <- sin(x = pi - thetaPerDV * (drawEmbryo$DV - 1)) * + sin(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$y3 <- sin(x = pi - thetaPerDV * (drawEmbryo$DV - 1)) * + sin(0.5 * pi + phiPerTier * drawEmbryo$tier) + drawEmbryo$y4 <- sin(x = pi - thetaPerDV * drawEmbryo$DV) * + sin(x = 0.5 * pi + phiPerTier * drawEmbryo$tier) + drawEmbryo$z1 <- cos(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$z2 <- cos(x = 0.5 * pi + phiPerTier * (drawEmbryo$tier - 1)) + drawEmbryo$z3 <- cos(x = 0.5 * pi + phiPerTier * drawEmbryo$tier) + drawEmbryo$z4 <- cos(x = 0.5 * pi + phiPerTier * drawEmbryo$tier) + # Now, reassign the color for each of the bins that has 
expression >0. + for (tier in 1:dim(x = expressionMatrix)[1]) { + for (DV in 1:dim(x = expressionMatrix)[2]) { + if (! expressionMatrix[tier, DV] == 0 ) { + # Figure out limits of the bins desired from the names of the row & col of this table cell + tierLimits <- as.numeric(x = unlist(x = strsplit( + x = row.names(x = expressionMatrix)[tier], + split = "-" + ))) + DVLimits <- as.numeric(x = unlist(x = strsplit( + x = names(x = expressionMatrix)[DV], + split = "-" + ))) + # Figure out the value for this color. + thisColor <- insituColors[(floor( + x = as.numeric(x = expressionMatrix[tier, DV]) * 50 + )) + 1] + # Loop through and assign the color to every bin in the limits + for (thisTier in min(tierLimits):max(tierLimits)) { + if (nonmirror) { + for (thisDV in min(DVLimits):max(DVLimits)) { + thisRow <- (thisDV - 1) * tierBins + thisTier + drawEmbryo[thisRow, ]$color <- thisColor + } + } + # If mirror is on, also assign the other side of the embryo. + if (mirror) { + for (thisDV in (DVBins - max(DVLimits) + 1):(DVBins - min(DVLimits) + 1)) { + thisRow <- (thisDV - 1) * tierBins + thisTier + drawEmbryo[thisRow, ]$color <- thisColor + } + } + } + + } + } + } + # Take the coordinates and reformat the lists to pass to RGL + quadX <- interleave( + drawEmbryo$x1, + drawEmbryo$x2, + drawEmbryo$x3, + drawEmbryo$x4, + drop = TRUE + ) + dim(x = quadX) <- c(dim(x = quadX)[1] * dim(x = quadX)[2], 1) + quadY <- interleave( + drawEmbryo$y1, + drawEmbryo$y2, + drawEmbryo$y3, + drawEmbryo$y4, + drop = TRUE + ) + dim(x = quadY) <- c(dim(x = quadY)[1] * dim(x = quadY)[2], 1) + quadZ <- interleave( + drawEmbryo$z1, + drawEmbryo$z2, + drawEmbryo$z3, + drawEmbryo$z4, + drop = TRUE + ) + dim(x = quadZ) <- c(dim(x = quadZ)[1] * dim(x = quadZ)[2], 1) + quadColor <- rep(x = drawEmbryo$color, each = 4) + # Initialize an RGL view + open3d() + # Call quads to plot the embryo. 
+ quads3d( + x = quadX, + y = quadY, + z = quadZ, + color = quadColor, + alpha = 1, + lit = FALSE + ) +} + +#used for zebrafish plotting +vp.layout <- function(x, y) { + viewport(layout.pos.row = x, layout.pos.col = y) +} diff --git a/README.md b/README.md new file mode 100644 index 000000000..1fe743530 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +[![Build Status](https://travis-ci.org/satijalab/seurat.svg?branch=develop)](https://travis-ci.org/satijalab/seurat)[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/satijalab/seurat?branch=develop&svg=true)](https://ci.appveyor.com/project/satijalab/seurat/branch/develop) + +# Seurat v2.0 + +Seurat is an R toolkit for single cell genomics, developed and maintained by the Satija Lab at NYGC. + +Instructions, documentation, and tutorials can be found at: +* http://www.satijalab.org/seurat + +Seurat is also hosted on GitHub, you can view and clone the repository at +* https://github.com/satijalab/seurat + +Seurat has been successfully installed on Mac OS X, Linux, and Windows, using the devtools package to install directly from GitHub + +Improvements and new features will be added on a regular basis, please contact seuratpackage@gmail.com with any questions or if you would like to contribute + +Version History + +July 17, 2017 +* Version 2.0 pre-release +* Changes: + * Preprint released for integrated analysis of scRNA-seq across conditions, technologies and species + * Significant restructuring of code to support clarity and dataset exploration + +October 4, 2016 +* Version 1.4 released +* Changes: + * Improved tools for cluster evaluation/visualizations + * Methods for combining and adding to datasets + +August 22, 2016: +* Version 1.3 released +* Changes : + * Improved clustering approach - see FAQ for details + * All functions support sparse matrices + * Methods for removing unwanted sources of variation + * Consistent function names + * Updated visualizations + +May 21, 2015: +* Drop-Seq 
manuscript published. Version 1.2 released +* Changes : + * Added support for spectral t-SNE and density clustering + * New visualizations - including pcHeatmap, dot.plot, and feature.plot + * Expanded package documentation, reduced import package burden + * Seurat code is now hosted on GitHub, enables easy install through devtools + * Small bug fixes + +April 13, 2015: +* Spatial mapping manuscript published. Version 1.1 released (initial release) diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..8ff07ec22 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,46 @@ +# DO NOT CHANGE the "init" and "install" sections below + +# Download script file from GitHub +init: + ps: | + $ErrorActionPreference = "Stop" + Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" + Import-Module '..\appveyor-tool.ps1' +install: + ps: Bootstrap + +# Adapt as necessary starting from here + +environment: + global: + R_ARCH: x64 + USE_RTOOLS: true + +build_script: + - travis-tool.sh install_deps + +test_script: + - travis-tool.sh run_tests + +on_failure: + - 7z a failure.zip *.Rcheck\* + - appveyor PushArtifact failure.zip + +artifacts: + - path: '*.Rcheck\**\*.log' + name: Logs + + - path: '*.Rcheck\**\*.out' + name: Logs + + - path: '*.Rcheck\**\*.fail' + name: Logs + + - path: '*.Rcheck\**\*.Rout' + name: Logs + + - path: '\*_*.tar.gz' + name: Bits + + - path: '\*_*.zip' + name: Bits diff --git a/inst/java/ModularityOptimizer.jar b/inst/java/ModularityOptimizer.jar new file mode 100644 index 000000000..496015a5b Binary files /dev/null and b/inst/java/ModularityOptimizer.jar differ diff --git a/java/Arrays2.java b/java/Arrays2.java new file mode 100755 index 000000000..1dee79694 --- /dev/null +++ b/java/Arrays2.java @@ -0,0 +1,116 @@ +/** + * Arrays2 + * + * @author Ludo Waltman + * @author Nees Jan van Eck + * @version 1.3.1, 11/17/14 + */ + +import java.util.Arrays; +import java.util.Random; + 
+public class Arrays2 +{ + public static double calcSum(double[] value) + { + double sum; + int i; + + sum = 0; + for (i = 0; i < value.length; i++) + sum += value[i]; + return sum; + } + + public static double calcSum(double[] value, int beginIndex, int endIndex) + { + double sum; + int i; + + sum = 0; + for (i = beginIndex; i < endIndex; i++) + sum += value[i]; + return sum; + } + + public static double calcAverage(double[] value) + { + double average; + int i; + + average = 0; + for (i = 0; i < value.length; i++) + average += value[i]; + average /= value.length; + return average; + } + + public static double calcMedian(double[] value) + { + double median; + double[] sortedValue; + + sortedValue = (double[])value.clone(); + Arrays.sort(sortedValue); + if (sortedValue.length % 2 == 1) + median = sortedValue[(sortedValue.length - 1) / 2]; + else + median = (sortedValue[sortedValue.length / 2 - 1] + sortedValue[sortedValue.length / 2]) / 2; + return median; + } + + public static double calcMinimum(double[] value) + { + double minimum; + int i; + + minimum = value[0]; + for (i = 1; i < value.length; i++) + minimum = Math.min(minimum, value[i]); + return minimum; + } + + public static double calcMaximum(double[] value) + { + double maximum; + int i; + + maximum = value[0]; + for (i = 1; i < value.length; i++) + maximum = Math.max(maximum, value[i]); + return maximum; + } + + public static int calcMaximum(int[] value) + { + int i, maximum; + + maximum = value[0]; + for (i = 1; i < value.length; i++) + maximum = Math.max(maximum, value[i]); + return maximum; + } + + public static int[] generateRandomPermutation(int nElements) + { + return generateRandomPermutation(nElements, new Random()); + } + + public static int[] generateRandomPermutation(int nElements, Random random) + { + int i, j, k; + int[] permutation; + + permutation = new int[nElements]; + for (i = 0; i < nElements; i++) + permutation[i] = i; + for (i = 0; i < nElements; i++) + { + j = 
random.nextInt(nElements); + k = permutation[i]; + permutation[i] = permutation[j]; + permutation[j] = k; + } + return permutation; + } +} diff --git a/java/Clustering.java b/java/Clustering.java new file mode 100755 index 000000000..d1f7a8c1a --- /dev/null +++ b/java/Clustering.java @@ -0,0 +1,198 @@ +/** + * Clustering + * + * @author Ludo Waltman + * @author Nees Jan van Eck + * @version 1.3.1 11/17/14 + */ + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.Arrays; + +public class Clustering implements Cloneable, Serializable +{ + private static final long serialVersionUID = 1; + + protected int nNodes; + protected int nClusters; + protected int[] cluster; + + public static Clustering load(String fileName) throws ClassNotFoundException, IOException + { + Clustering clustering; + ObjectInputStream objectInputStream; + + objectInputStream = new ObjectInputStream(new FileInputStream(fileName)); + + clustering = (Clustering)objectInputStream.readObject(); + + objectInputStream.close(); + + return clustering; + } + + public Clustering(int nNodes) + { + this.nNodes = nNodes; + cluster = new int[nNodes]; + nClusters = 1; + } + + public Clustering(int[] cluster) + { + nNodes = cluster.length; + this.cluster = (int[])cluster.clone(); + nClusters = Arrays2.calcMaximum(cluster) + 1; + } + + public Object clone() + { + Clustering clonedClustering; + + try + { + clonedClustering = (Clustering)super.clone(); + clonedClustering.cluster = getClusters(); + return clonedClustering; + } + catch (CloneNotSupportedException e) + { + return null; + } + } + + public void save(String fileName) throws IOException + { + ObjectOutputStream objectOutputStream; + + objectOutputStream = new ObjectOutputStream(new FileOutputStream(fileName)); + + objectOutputStream.writeObject(this); + + objectOutputStream.close(); + } + + public 
int getNNodes() + { + return nNodes; + } + + public int getNClusters() + { + return nClusters; + } + + public int[] getClusters() + { + return (int[])cluster.clone(); + } + + public int getCluster(int node) + { + return cluster[node]; + } + + public int[] getNNodesPerCluster() + { + int i; + int[] nNodesPerCluster; + + nNodesPerCluster = new int[nClusters]; + for (i = 0; i < nNodes; i++) + nNodesPerCluster[cluster[i]]++; + return nNodesPerCluster; + } + + public int[][] getNodesPerCluster() + { + int i; + int[] nNodesPerCluster; + int[][] nodePerCluster; + + nodePerCluster = new int[nClusters][]; + nNodesPerCluster = getNNodesPerCluster(); + for (i = 0; i < nClusters; i++) + { + nodePerCluster[i] = new int[nNodesPerCluster[i]]; + nNodesPerCluster[i] = 0; + } + for (i = 0; i < nNodes; i++) + { + nodePerCluster[cluster[i]][nNodesPerCluster[cluster[i]]] = i; + nNodesPerCluster[cluster[i]]++; + } + return nodePerCluster; + } + + public void setCluster(int node, int cluster) + { + this.cluster[node] = cluster; + nClusters = Math.max(nClusters, cluster + 1); + } + + public void initSingletonClusters() + { + int i; + + for (i = 0; i < nNodes; i++) + cluster[i] = i; + nClusters = nNodes; + } + + public void orderClustersByNNodes() + { + class ClusterNNodes implements Comparable + { + public int cluster; + public int nNodes; + + public ClusterNNodes(int cluster, int nNodes) + { + this.cluster = cluster; + this.nNodes = nNodes; + } + + public int compareTo(ClusterNNodes clusterNNodes) + { + return (clusterNNodes.nNodes > nNodes) ? 1 : ((clusterNNodes.nNodes < nNodes) ? 
-1 : 0); + } + } + + ClusterNNodes[] clusterNNodes; + int i; + int[] newCluster, nNodesPerCluster; + + nNodesPerCluster = getNNodesPerCluster(); + clusterNNodes = new ClusterNNodes[nClusters]; + for (i = 0; i < nClusters; i++) + clusterNNodes[i] = new ClusterNNodes(i, nNodesPerCluster[i]); + + Arrays.sort(clusterNNodes); + + newCluster = new int[nClusters]; + i = 0; + do + { + newCluster[clusterNNodes[i].cluster] = i; + i++; + } + while ((i < nClusters) && (clusterNNodes[i].nNodes > 0)); + nClusters = i; + for (i = 0; i < nNodes; i++) + cluster[i] = newCluster[cluster[i]]; + } + + public void mergeClusters(Clustering clustering) + { + int i; + + for (i = 0; i < nNodes; i++) + cluster[i] = clustering.cluster[cluster[i]]; + nClusters = clustering.nClusters; + } +} diff --git a/java/ModularityOptimizer.java b/java/ModularityOptimizer.java new file mode 100755 index 000000000..247ebee3d --- /dev/null +++ b/java/ModularityOptimizer.java @@ -0,0 +1,257 @@ +/** + * ModularityOptimizer + * + * @author Ludo Waltman + * @author Nees Jan van Eck + * @version 1.3.0, 08/31/15 + */ + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.Console; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Arrays; +import java.util.Random; + +public class ModularityOptimizer +{ + public static void main(String[] args) throws IOException + { + boolean printOutput, update; + Clustering clustering; + Console console; + double modularity, maxModularity, resolution, resolution2; + int algorithm, i, j, modularityFunction, nIterations, nRandomStarts; + long beginTime, endTime, randomSeed; + Network network; + Random random; + String inputFileName, outputFileName; + VOSClusteringTechnique VOSClusteringTechnique; + + if (args.length == 9) + { + inputFileName = args[0]; + outputFileName = args[1]; + modularityFunction = Integer.parseInt(args[2]); + resolution = Double.parseDouble(args[3]); + algorithm = 
Integer.parseInt(args[4]); + nRandomStarts = Integer.parseInt(args[5]); + nIterations = Integer.parseInt(args[6]); + randomSeed = Long.parseLong(args[7]); + printOutput = (Integer.parseInt(args[8]) > 0); + + if (printOutput) + { + System.out.println("Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck"); + System.out.println(); + } + } + else + { + console = System.console(); + System.out.println("Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck"); + System.out.println(); + inputFileName = console.readLine("Input file name: "); + outputFileName = console.readLine("Output file name: "); + modularityFunction = Integer.parseInt(console.readLine("Modularity function (1 = standard; 2 = alternative): ")); + resolution = Double.parseDouble(console.readLine("Resolution parameter (e.g., 1.0): ")); + algorithm = Integer.parseInt(console.readLine("Algorithm (1 = Louvain; 2 = Louvain with multilevel refinement; 3 = smart local moving): ")); + nRandomStarts = Integer.parseInt(console.readLine("Number of random starts (e.g., 10): ")); + nIterations = Integer.parseInt(console.readLine("Number of iterations (e.g., 10): ")); + randomSeed = Long.parseLong(console.readLine("Random seed (e.g., 0): ")); + printOutput = (Integer.parseInt(console.readLine("Print output (0 = no; 1 = yes): ")) > 0); + System.out.println(); + } + + if (printOutput) + { + System.out.println("Reading input file..."); + System.out.println(); + } + + network = readInputFile(inputFileName, modularityFunction); + + if (printOutput) + { + System.out.format("Number of nodes: %d%n", network.getNNodes()); + System.out.format("Number of edges: %d%n", network.getNEdges()); + System.out.println(); + System.out.println("Running " + ((algorithm == 1) ? "Louvain algorithm" : ((algorithm == 2) ? "Louvain algorithm with multilevel refinement" : "smart local moving algorithm")) + "..."); + System.out.println(); + } + + resolution2 = ((modularityFunction == 1) ? 
(resolution / (2 * network.getTotalEdgeWeight() + network.totalEdgeWeightSelfLinks)) : resolution); + + beginTime = System.currentTimeMillis(); + clustering = null; + maxModularity = Double.NEGATIVE_INFINITY; + random = new Random(randomSeed); + for (i = 0; i < nRandomStarts; i++) + { + if (printOutput && (nRandomStarts > 1)) + System.out.format("Random start: %d%n", i + 1); + + VOSClusteringTechnique = new VOSClusteringTechnique(network, resolution2); + + j = 0; + update = true; + do + { + if (printOutput && (nIterations > 1)) + System.out.format("Iteration: %d%n", j + 1); + + if (algorithm == 1) + update = VOSClusteringTechnique.runLouvainAlgorithm(random); + else if (algorithm == 2) + update = VOSClusteringTechnique.runLouvainAlgorithmWithMultilevelRefinement(random); + else if (algorithm == 3) + VOSClusteringTechnique.runSmartLocalMovingAlgorithm(random); + j++; + + modularity = VOSClusteringTechnique.calcQualityFunction(); + + if (printOutput && (nIterations > 1)) + System.out.format("Modularity: %.4f%n", modularity); + } + while ((j < nIterations) && update); + + if (modularity > maxModularity) + { + clustering = VOSClusteringTechnique.getClustering(); + maxModularity = modularity; + } + + if (printOutput && (nRandomStarts > 1)) + { + if (nIterations == 1) + System.out.format("Modularity: %.4f%n", modularity); + System.out.println(); + } + } + endTime = System.currentTimeMillis(); + + if (printOutput) + { + if (nRandomStarts == 1) + { + if (nIterations > 1) + System.out.println(); + System.out.format("Modularity: %.4f%n", maxModularity); + } + else + System.out.format("Maximum modularity in %d random starts: %.4f%n", nRandomStarts, maxModularity); + System.out.format("Number of communities: %d%n", clustering.getNClusters()); + System.out.format("Elapsed time: %d seconds%n", Math.round((endTime - beginTime) / 1000.0)); + System.out.println(); + System.out.println("Writing output file..."); + System.out.println(); + } + + writeOutputFile(outputFileName, 
clustering); + } + + private static Network readInputFile(String fileName, int modularityFunction) throws IOException + { + BufferedReader bufferedReader; + double[] edgeWeight1, edgeWeight2, nodeWeight; + int i, j, nEdges, nLines, nNodes; + int[] firstNeighborIndex, neighbor, nNeighbors, node1, node2; + Network network; + String[] splittedLine; + + bufferedReader = new BufferedReader(new FileReader(fileName)); + + nLines = 0; + while (bufferedReader.readLine() != null) + nLines++; + + bufferedReader.close(); + + bufferedReader = new BufferedReader(new FileReader(fileName)); + + node1 = new int[nLines]; + node2 = new int[nLines]; + edgeWeight1 = new double[nLines]; + i = -1; + for (j = 0; j < nLines; j++) + { + splittedLine = bufferedReader.readLine().split("\t"); + node1[j] = Integer.parseInt(splittedLine[0]); + if (node1[j] > i) + i = node1[j]; + node2[j] = Integer.parseInt(splittedLine[1]); + if (node2[j] > i) + i = node2[j]; + edgeWeight1[j] = (splittedLine.length > 2) ? Double.parseDouble(splittedLine[2]) : 1; + } + nNodes = i + 1; + + bufferedReader.close(); + + nNeighbors = new int[nNodes]; + for (i = 0; i < nLines; i++) + if (node1[i] < node2[i]) + { + nNeighbors[node1[i]]++; + nNeighbors[node2[i]]++; + } + + firstNeighborIndex = new int[nNodes + 1]; + nEdges = 0; + for (i = 0; i < nNodes; i++) + { + firstNeighborIndex[i] = nEdges; + nEdges += nNeighbors[i]; + } + firstNeighborIndex[nNodes] = nEdges; + + neighbor = new int[nEdges]; + edgeWeight2 = new double[nEdges]; + Arrays.fill(nNeighbors, 0); + for (i = 0; i < nLines; i++) + if (node1[i] < node2[i]) + { + j = firstNeighborIndex[node1[i]] + nNeighbors[node1[i]]; + neighbor[j] = node2[i]; + edgeWeight2[j] = edgeWeight1[i]; + nNeighbors[node1[i]]++; + j = firstNeighborIndex[node2[i]] + nNeighbors[node2[i]]; + neighbor[j] = node1[i]; + edgeWeight2[j] = edgeWeight1[i]; + nNeighbors[node2[i]]++; + } + + if (modularityFunction == 1) + network = new Network(nNodes, firstNeighborIndex, neighbor, edgeWeight2); + 
else + { + nodeWeight = new double[nNodes]; + Arrays.fill(nodeWeight, 1); + network = new Network(nNodes, nodeWeight, firstNeighborIndex, neighbor, edgeWeight2); + } + + return network; + } + + private static void writeOutputFile(String fileName, Clustering clustering) throws IOException + { + BufferedWriter bufferedWriter; + int i, nNodes; + + nNodes = clustering.getNNodes(); + + clustering.orderClustersByNNodes(); + + bufferedWriter = new BufferedWriter(new FileWriter(fileName)); + + for (i = 0; i < nNodes; i++) + { + bufferedWriter.write(Integer.toString(clustering.getCluster(i))); + bufferedWriter.newLine(); + } + + bufferedWriter.close(); + } +} diff --git a/java/Network.java b/java/Network.java new file mode 100755 index 000000000..751f691d6 --- /dev/null +++ b/java/Network.java @@ -0,0 +1,716 @@ +/** + * Network + * + * @author Ludo Waltman + * @author Nees Jan van Eck + * @version 1.3.1, 08/30/15 + */ + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Random; + +public class Network implements Serializable +{ + private static final long serialVersionUID = 1; + + protected int nNodes; + protected int nEdges; + protected double[] nodeWeight; + protected int[] firstNeighborIndex; + protected int[] neighbor; + protected double[] edgeWeight; + protected double totalEdgeWeightSelfLinks; + + public static Network load(String fileName) throws ClassNotFoundException, IOException + { + Network network; + ObjectInputStream objectInputStream; + + objectInputStream = new ObjectInputStream(new FileInputStream(fileName)); + + network = (Network)objectInputStream.readObject(); + + objectInputStream.close(); + + return network; + } + + public Network(int nNodes, int[][] edge) + { + this(nNodes, null, edge, null); + } + + public Network(int nNodes, double[] nodeWeight, int[][] edge) + { 
+ this(nNodes, nodeWeight, edge, null); + } + + public Network(int nNodes, int[][] edge, double[] edgeWeight) + { + this(nNodes, null, edge, edgeWeight); + } + + public Network(int nNodes, double[] nodeWeight, int[][] edge, double[] edgeWeight) + { + double[] edgeWeight2; + int i, j; + int[] neighbor; + + this.nNodes = nNodes; + + nEdges = 0; + firstNeighborIndex = new int[nNodes + 1]; + neighbor = new int[edge[0].length]; + edgeWeight2 = new double[edge[0].length]; + totalEdgeWeightSelfLinks = 0; + i = 1; + for (j = 0; j < edge[0].length; j++) + if (edge[0][j] != edge[1][j]) + { + if (edge[0][j] >= i) + for (; i <= edge[0][j]; i++) + firstNeighborIndex[i] = nEdges; + neighbor[nEdges] = edge[1][j]; + edgeWeight2[nEdges] = (edgeWeight != null) ? edgeWeight[j] : 1; + nEdges++; + } + else + totalEdgeWeightSelfLinks += (edgeWeight != null) ? edgeWeight[j] : 1; + for (; i <= nNodes; i++) + firstNeighborIndex[i] = nEdges; + this.neighbor = Arrays.copyOfRange(neighbor, 0, nEdges); + this.edgeWeight = Arrays.copyOfRange(edgeWeight2, 0, nEdges); + + this.nodeWeight = (nodeWeight != null) ? 
(double[])nodeWeight.clone() : getTotalEdgeWeightPerNode(); + } + + public Network(int nNodes, int[] firstNeighborIndex, int[] neighbor) + { + this(nNodes, null, firstNeighborIndex, neighbor, null); + } + + public Network(int nNodes, double[] nodeWeight, int[] firstNeighborIndex, int[] neighbor) + { + this(nNodes, nodeWeight, firstNeighborIndex, neighbor, null); + } + + public Network(int nNodes, int[] firstNeighborIndex, int[] neighbor, double[] edgeWeight) + { + this(nNodes, null, firstNeighborIndex, neighbor, edgeWeight); + } + + public Network(int nNodes, double[] nodeWeight, int[] firstNeighborIndex, int[] neighbor, double[] edgeWeight) + { + this.nNodes = nNodes; + + nEdges = neighbor.length; + this.firstNeighborIndex = (int[])firstNeighborIndex.clone(); + this.neighbor = (int[])neighbor.clone(); + if (edgeWeight != null) + this.edgeWeight = (double[])edgeWeight.clone(); + else + { + this.edgeWeight = new double[nEdges]; + Arrays.fill(this.edgeWeight, 1); + } + totalEdgeWeightSelfLinks = 0; + + this.nodeWeight = (nodeWeight != null) ? 
(double[])nodeWeight.clone() : getTotalEdgeWeightPerNode(); + } + + public void save(String fileName) throws IOException + { + ObjectOutputStream objectOutputStream; + + objectOutputStream = new ObjectOutputStream(new FileOutputStream(fileName)); + + objectOutputStream.writeObject(this); + + objectOutputStream.close(); + } + + public int getNNodes() + { + return nNodes; + } + + public double getTotalNodeWeight() + { + return Arrays2.calcSum(nodeWeight); + } + + public double[] getNodeWeights() + { + return (double[])nodeWeight.clone(); + } + + public double getNodeWeight(int node) + { + return nodeWeight[node]; + } + + public int getNEdges() + { + return nEdges / 2; + } + + public int getNEdges(int node) + { + return firstNeighborIndex[node + 1] - firstNeighborIndex[node]; + } + + public int[] getNEdgesPerNode() + { + int i; + int[] nEdgesPerNode; + + nEdgesPerNode = new int[nNodes]; + for (i = 0; i < nNodes; i++) + nEdgesPerNode[i] = firstNeighborIndex[i + 1] - firstNeighborIndex[i]; + return nEdgesPerNode; + } + + public int[][] getEdges() + { + int i; + int[][] edge; + + edge = new int[2][]; + edge[0] = new int[nEdges]; + for (i = 0; i < nNodes; i++) + Arrays.fill(edge[0], firstNeighborIndex[i], firstNeighborIndex[i + 1], i); + edge[1] = (int[])neighbor.clone(); + return edge; + } + + public int[] getEdges(int node) + { + return Arrays.copyOfRange(neighbor, firstNeighborIndex[node], firstNeighborIndex[node + 1]); + } + + public int[][] getEdgesPerNode() + { + int i; + int[][] edgePerNode; + + edgePerNode = new int[nNodes][]; + for (i = 0; i < nNodes; i++) + edgePerNode[i] = Arrays.copyOfRange(neighbor, firstNeighborIndex[i], firstNeighborIndex[i + 1]); + return edgePerNode; + } + + public double getTotalEdgeWeight() + { + return Arrays2.calcSum(edgeWeight) / 2; + } + + public double getTotalEdgeWeight(int node) + { + return Arrays2.calcSum(edgeWeight, firstNeighborIndex[node], firstNeighborIndex[node + 1]); + } + + public double[] getTotalEdgeWeightPerNode() + { 
+ double[] totalEdgeWeightPerNode; + int i; + + totalEdgeWeightPerNode = new double[nNodes]; + for (i = 0; i < nNodes; i++) + totalEdgeWeightPerNode[i] = Arrays2.calcSum(edgeWeight, firstNeighborIndex[i], firstNeighborIndex[i + 1]); + return totalEdgeWeightPerNode; + } + + public double[] getEdgeWeights() + { + return (double[])edgeWeight.clone(); + } + + public double[] getEdgeWeights(int node) + { + return Arrays.copyOfRange(edgeWeight, firstNeighborIndex[node], firstNeighborIndex[node + 1]); + } + + public double[][] getEdgeWeightsPerNode() + { + double[][] edgeWeightPerNode; + int i; + + edgeWeightPerNode = new double[nNodes][]; + for (i = 0; i < nNodes; i++) + edgeWeightPerNode[i] = Arrays.copyOfRange(edgeWeight, firstNeighborIndex[i], firstNeighborIndex[i + 1]); + return edgeWeightPerNode; + } + + public double getTotalEdgeWeightSelfLinks() + { + return totalEdgeWeightSelfLinks; + } + + public Network createNetworkWithoutNodeWeights() + { + Network networkWithoutNodeWeights; + + networkWithoutNodeWeights = new Network(); + networkWithoutNodeWeights.nNodes = nNodes; + networkWithoutNodeWeights.nEdges = nEdges; + networkWithoutNodeWeights.nodeWeight = new double[nNodes]; + Arrays.fill(networkWithoutNodeWeights.nodeWeight, 1); + networkWithoutNodeWeights.firstNeighborIndex = firstNeighborIndex; + networkWithoutNodeWeights.neighbor = neighbor; + networkWithoutNodeWeights.edgeWeight = edgeWeight; + networkWithoutNodeWeights.totalEdgeWeightSelfLinks = totalEdgeWeightSelfLinks; + return networkWithoutNodeWeights; + } + + public Network createNetworkWithoutEdgeWeights() + { + Network networkWithoutEdgeWeights; + + networkWithoutEdgeWeights = new Network(); + networkWithoutEdgeWeights.nNodes = nNodes; + networkWithoutEdgeWeights.nEdges = nEdges; + networkWithoutEdgeWeights.nodeWeight = nodeWeight; + networkWithoutEdgeWeights.firstNeighborIndex = firstNeighborIndex; + networkWithoutEdgeWeights.neighbor = neighbor; + networkWithoutEdgeWeights.edgeWeight = new 
double[nEdges]; + Arrays.fill(networkWithoutEdgeWeights.edgeWeight, 1); + networkWithoutEdgeWeights.totalEdgeWeightSelfLinks = 0; + return networkWithoutEdgeWeights; + } + + public Network createNetworkWithoutNodeAndEdgeWeights() + { + Network networkWithoutNodeAndEdgeWeights; + + networkWithoutNodeAndEdgeWeights = new Network(); + networkWithoutNodeAndEdgeWeights.nNodes = nNodes; + networkWithoutNodeAndEdgeWeights.nEdges = nEdges; + networkWithoutNodeAndEdgeWeights.nodeWeight = new double[nNodes]; + Arrays.fill(networkWithoutNodeAndEdgeWeights.nodeWeight, 1); + networkWithoutNodeAndEdgeWeights.firstNeighborIndex = firstNeighborIndex; + networkWithoutNodeAndEdgeWeights.neighbor = neighbor; + networkWithoutNodeAndEdgeWeights.edgeWeight = new double[nEdges]; + Arrays.fill(networkWithoutNodeAndEdgeWeights.edgeWeight, 1); + networkWithoutNodeAndEdgeWeights.totalEdgeWeightSelfLinks = 0; + return networkWithoutNodeAndEdgeWeights; + } + + public Network createNormalizedNetwork1() + { + double totalNodeWeight; + int i, j; + Network normalizedNetwork; + + normalizedNetwork = new Network(); + + normalizedNetwork.nNodes = nNodes; + normalizedNetwork.nEdges = nEdges; + normalizedNetwork.nodeWeight = new double[nNodes]; + Arrays.fill(normalizedNetwork.nodeWeight, 1); + normalizedNetwork.firstNeighborIndex = firstNeighborIndex; + normalizedNetwork.neighbor = neighbor; + + normalizedNetwork.edgeWeight = new double[nEdges]; + totalNodeWeight = getTotalNodeWeight(); + for (i = 0; i < nNodes; i++) + for (j = firstNeighborIndex[i]; j < firstNeighborIndex[i + 1]; j++) + normalizedNetwork.edgeWeight[j] = edgeWeight[j] / ((nodeWeight[i] * nodeWeight[neighbor[j]]) / totalNodeWeight); + + normalizedNetwork.totalEdgeWeightSelfLinks = 0; + + return normalizedNetwork; + } + + public Network createNormalizedNetwork2() + { + int i, j; + Network normalizedNetwork; + + normalizedNetwork = new Network(); + + normalizedNetwork.nNodes = nNodes; + normalizedNetwork.nEdges = nEdges; + 
normalizedNetwork.nodeWeight = new double[nNodes]; + Arrays.fill(normalizedNetwork.nodeWeight, 1); + normalizedNetwork.firstNeighborIndex = firstNeighborIndex; + normalizedNetwork.neighbor = neighbor; + + normalizedNetwork.edgeWeight = new double[nEdges]; + for (i = 0; i < nNodes; i++) + for (j = firstNeighborIndex[i]; j < firstNeighborIndex[i + 1]; j++) + normalizedNetwork.edgeWeight[j] = edgeWeight[j] / (2 / (nNodes / nodeWeight[i] + nNodes / nodeWeight[neighbor[j]])); + + normalizedNetwork.totalEdgeWeightSelfLinks = 0; + + return normalizedNetwork; + } + + public Network createPrunedNetwork(int nEdges) + { + return createPrunedNetwork(nEdges, new Random()); + } + + public Network createPrunedNetwork(int nEdges, Random random) + { + double edgeWeightThreshold, randomNumberThreshold; + double[] edgeWeight, randomNumber; + int i, j, k, nEdgesAboveThreshold, nEdgesAtThreshold; + int[] nodePermutation; + Network prunedNetwork; + + nEdges *= 2; + + if (nEdges >= this.nEdges) + return this; + + edgeWeight = new double[this.nEdges / 2]; + i = 0; + for (j = 0; j < nNodes; j++) + for (k = firstNeighborIndex[j]; k < firstNeighborIndex[j + 1]; k++) + if (neighbor[k] < j) + { + edgeWeight[i] = this.edgeWeight[k]; + i++; + } + Arrays.sort(edgeWeight); + edgeWeightThreshold = edgeWeight[(this.nEdges - nEdges) / 2]; + + nEdgesAboveThreshold = 0; + while (edgeWeight[this.nEdges / 2 - nEdgesAboveThreshold - 1] > edgeWeightThreshold) + nEdgesAboveThreshold++; + nEdgesAtThreshold = 0; + while ((nEdgesAboveThreshold + nEdgesAtThreshold < this.nEdges / 2) && (edgeWeight[this.nEdges / 2 - nEdgesAboveThreshold - nEdgesAtThreshold - 1] == edgeWeightThreshold)) + nEdgesAtThreshold++; + + nodePermutation = Arrays2.generateRandomPermutation(nNodes, random); + + randomNumber = new double[nEdgesAtThreshold]; + i = 0; + for (j = 0; j < nNodes; j++) + for (k = firstNeighborIndex[j]; k < firstNeighborIndex[j + 1]; k++) + if ((neighbor[k] < j) && (this.edgeWeight[k] == edgeWeightThreshold)) + { 
+ randomNumber[i] = generateRandomNumber(j, neighbor[k], nodePermutation); + i++; + } + Arrays.sort(randomNumber); + randomNumberThreshold = randomNumber[nEdgesAboveThreshold + nEdgesAtThreshold - nEdges / 2]; + + prunedNetwork = new Network(); + + prunedNetwork.nNodes = nNodes; + prunedNetwork.nEdges = nEdges; + prunedNetwork.nodeWeight = nodeWeight; + + prunedNetwork.firstNeighborIndex = new int[nNodes + 1]; + prunedNetwork.neighbor = new int[nEdges]; + prunedNetwork.edgeWeight = new double[nEdges]; + i = 0; + for (j = 0; j < nNodes; j++) + { + for (k = firstNeighborIndex[j]; k < firstNeighborIndex[j + 1]; k++) + if ((this.edgeWeight[k] > edgeWeightThreshold) || ((this.edgeWeight[k] == edgeWeightThreshold) && (generateRandomNumber(j, neighbor[k], nodePermutation) >= randomNumberThreshold))) + { + prunedNetwork.neighbor[i] = neighbor[k]; + prunedNetwork.edgeWeight[i] = this.edgeWeight[k]; + i++; + } + prunedNetwork.firstNeighborIndex[j + 1] = i; + } + + prunedNetwork.totalEdgeWeightSelfLinks = totalEdgeWeightSelfLinks; + + return prunedNetwork; + } + + public Network createSubnetwork(int[] node) + { + double[] subnetworkEdgeWeight; + int i, j, k; + int[] subnetworkNode, subnetworkNeighbor; + Network subnetwork; + + subnetwork = new Network(); + + subnetwork.nNodes = node.length; + + if (subnetwork.nNodes == 1) + { + subnetwork.nEdges = 0; + subnetwork.nodeWeight = new double[] {nodeWeight[node[0]]}; + subnetwork.firstNeighborIndex = new int[2]; + subnetwork.neighbor = new int[0]; + subnetwork.edgeWeight = new double[0]; + } + else + { + subnetworkNode = new int[nNodes]; + Arrays.fill(subnetworkNode, -1); + for (i = 0; i < node.length; i++) + subnetworkNode[node[i]] = i; + + subnetwork.nEdges = 0; + subnetwork.nodeWeight = new double[subnetwork.nNodes]; + subnetwork.firstNeighborIndex = new int[subnetwork.nNodes + 1]; + subnetworkNeighbor = new int[nEdges]; + subnetworkEdgeWeight = new double[nEdges]; + for (i = 0; i < subnetwork.nNodes; i++) + { + j = node[i]; + 
subnetwork.nodeWeight[i] = nodeWeight[j]; + for (k = firstNeighborIndex[j]; k < firstNeighborIndex[j + 1]; k++) + if (subnetworkNode[neighbor[k]] >= 0) + { + subnetworkNeighbor[subnetwork.nEdges] = subnetworkNode[neighbor[k]]; + subnetworkEdgeWeight[subnetwork.nEdges] = edgeWeight[k]; + subnetwork.nEdges++; + } + subnetwork.firstNeighborIndex[i + 1] = subnetwork.nEdges; + } + subnetwork.neighbor = Arrays.copyOfRange(subnetworkNeighbor, 0, subnetwork.nEdges); + subnetwork.edgeWeight = Arrays.copyOfRange(subnetworkEdgeWeight, 0, subnetwork.nEdges); + } + + subnetwork.totalEdgeWeightSelfLinks = 0; + + return subnetwork; + } + + public Network createSubnetwork(boolean[] nodeInSubnetwork) + { + int i, j; + int[] node; + + i = 0; + for (j = 0; j < nNodes; j++) + if (nodeInSubnetwork[j]) + i++; + node = new int[i]; + i = 0; + for (j = 0; j < nNodes; j++) + if (nodeInSubnetwork[j]) + { + node[i] = j; + i++; + } + return createSubnetwork(node); + } + + public Network createSubnetwork(Clustering clustering, int cluster) + { + double[] subnetworkEdgeWeight; + int[] subnetworkNeighbor, subnetworkNode; + int[][] nodePerCluster; + Network subnetwork; + + nodePerCluster = clustering.getNodesPerCluster(); + subnetworkNode = new int[nNodes]; + subnetworkNeighbor = new int[nEdges]; + subnetworkEdgeWeight = new double[nEdges]; + subnetwork = createSubnetwork(clustering, cluster, nodePerCluster[cluster], subnetworkNode, subnetworkNeighbor, subnetworkEdgeWeight); + return subnetwork; + } + + public Network[] createSubnetworks(Clustering clustering) + { + double[] subnetworkEdgeWeight; + int i; + int[] subnetworkNeighbor, subnetworkNode; + int[][] nodePerCluster; + Network[] subnetwork; + + subnetwork = new Network[clustering.nClusters]; + nodePerCluster = clustering.getNodesPerCluster(); + subnetworkNode = new int[nNodes]; + subnetworkNeighbor = new int[nEdges]; + subnetworkEdgeWeight = new double[nEdges]; + for (i = 0; i < clustering.nClusters; i++) + subnetwork[i] = 
createSubnetwork(clustering, i, nodePerCluster[i], subnetworkNode, subnetworkNeighbor, subnetworkEdgeWeight); + return subnetwork; + } + + public Network createSubnetworkLargestComponent() + { + return createSubnetwork(identifyComponents(), 0); + } + + public Network createReducedNetwork(Clustering clustering) + { + double[] reducedNetworkEdgeWeight1, reducedNetworkEdgeWeight2; + int i, j, k, l, m, n; + int[] reducedNetworkNeighbor1, reducedNetworkNeighbor2; + int[][] nodePerCluster; + Network reducedNetwork; + + reducedNetwork = new Network(); + + reducedNetwork.nNodes = clustering.nClusters; + + reducedNetwork.nEdges = 0; + reducedNetwork.nodeWeight = new double[clustering.nClusters]; + reducedNetwork.firstNeighborIndex = new int[clustering.nClusters + 1]; + reducedNetwork.totalEdgeWeightSelfLinks = totalEdgeWeightSelfLinks; + reducedNetworkNeighbor1 = new int[nEdges]; + reducedNetworkEdgeWeight1 = new double[nEdges]; + reducedNetworkNeighbor2 = new int[clustering.nClusters - 1]; + reducedNetworkEdgeWeight2 = new double[clustering.nClusters]; + nodePerCluster = clustering.getNodesPerCluster(); + for (i = 0; i < clustering.nClusters; i++) + { + j = 0; + for (k = 0; k < nodePerCluster[i].length; k++) + { + l = nodePerCluster[i][k]; + + reducedNetwork.nodeWeight[i] += nodeWeight[l]; + + for (m = firstNeighborIndex[l]; m < firstNeighborIndex[l + 1]; m++) + { + n = clustering.cluster[neighbor[m]]; + if (n != i) + { + if (reducedNetworkEdgeWeight2[n] == 0) + { + reducedNetworkNeighbor2[j] = n; + j++; + } + reducedNetworkEdgeWeight2[n] += edgeWeight[m]; + } + else + reducedNetwork.totalEdgeWeightSelfLinks += edgeWeight[m]; + } + } + + for (k = 0; k < j; k++) + { + reducedNetworkNeighbor1[reducedNetwork.nEdges + k] = reducedNetworkNeighbor2[k]; + reducedNetworkEdgeWeight1[reducedNetwork.nEdges + k] = reducedNetworkEdgeWeight2[reducedNetworkNeighbor2[k]]; + reducedNetworkEdgeWeight2[reducedNetworkNeighbor2[k]] = 0; + } + reducedNetwork.nEdges += j; + 
reducedNetwork.firstNeighborIndex[i + 1] = reducedNetwork.nEdges; + } + reducedNetwork.neighbor = Arrays.copyOfRange(reducedNetworkNeighbor1, 0, reducedNetwork.nEdges); + reducedNetwork.edgeWeight = Arrays.copyOfRange(reducedNetworkEdgeWeight1, 0, reducedNetwork.nEdges); + + return reducedNetwork; + } + + public Clustering identifyComponents() + { + boolean[] nodeVisited; + Clustering clustering; + int i, j, k, l; + int[] node; + + clustering = new Clustering(nNodes); + + clustering.nClusters = 0; + nodeVisited = new boolean[nNodes]; + node = new int[nNodes]; + for (i = 0; i < nNodes; i++) + if (!nodeVisited[i]) + { + clustering.cluster[i] = clustering.nClusters; + nodeVisited[i] = true; + node[0] = i; + j = 1; + k = 0; + do + { + for (l = firstNeighborIndex[node[k]]; l < firstNeighborIndex[node[k] + 1]; l++) + if (!nodeVisited[neighbor[l]]) + { + clustering.cluster[neighbor[l]] = clustering.nClusters; + nodeVisited[neighbor[l]] = true; + node[j] = neighbor[l]; + j++; + } + k++; + } + while (k < j); + + clustering.nClusters++; + } + + clustering.orderClustersByNNodes(); + + return clustering; + } + + private Network() + { + } + + private double generateRandomNumber(int node1, int node2, int[] nodePermutation) + { + int i, j; + Random random; + + if (node1 < node2) + { + i = node1; + j = node2; + } + else + { + i = node2; + j = node1; + } + random = new Random(nodePermutation[i] * nNodes + nodePermutation[j]); + return random.nextDouble(); + } + + private Network createSubnetwork(Clustering clustering, int cluster, int[] node, int[] subnetworkNode, int[] subnetworkNeighbor, double[] subnetworkEdgeWeight) + { + int i, j, k; + Network subnetwork; + + subnetwork = new Network(); + + subnetwork.nNodes = node.length; + + if (subnetwork.nNodes == 1) + { + subnetwork.nEdges = 0; + subnetwork.nodeWeight = new double[] {nodeWeight[node[0]]}; + subnetwork.firstNeighborIndex = new int[2]; + subnetwork.neighbor = new int[0]; + subnetwork.edgeWeight = new double[0]; + } + else + 
{ + for (i = 0; i < node.length; i++) + subnetworkNode[node[i]] = i; + + subnetwork.nEdges = 0; + subnetwork.nodeWeight = new double[subnetwork.nNodes]; + subnetwork.firstNeighborIndex = new int[subnetwork.nNodes + 1]; + for (i = 0; i < subnetwork.nNodes; i++) + { + j = node[i]; + subnetwork.nodeWeight[i] = nodeWeight[j]; + for (k = firstNeighborIndex[j]; k < firstNeighborIndex[j + 1]; k++) + if (clustering.cluster[neighbor[k]] == cluster) + { + subnetworkNeighbor[subnetwork.nEdges] = subnetworkNode[neighbor[k]]; + subnetworkEdgeWeight[subnetwork.nEdges] = edgeWeight[k]; + subnetwork.nEdges++; + } + subnetwork.firstNeighborIndex[i + 1] = subnetwork.nEdges; + } + subnetwork.neighbor = Arrays.copyOfRange(subnetworkNeighbor, 0, subnetwork.nEdges); + subnetwork.edgeWeight = Arrays.copyOfRange(subnetworkEdgeWeight, 0, subnetwork.nEdges); + } + + subnetwork.totalEdgeWeightSelfLinks = 0; + + return subnetwork; + } +} diff --git a/java/VOSClusteringTechnique.java b/java/VOSClusteringTechnique.java new file mode 100755 index 000000000..0eee8c084 --- /dev/null +++ b/java/VOSClusteringTechnique.java @@ -0,0 +1,457 @@ +/** + * VOSClusteringTechnique + * + * @author Ludo Waltman + * @author Nees Jan van Eck + * @version 1.3.1, 11/23/14 + */ + +import java.util.Random; + +public class VOSClusteringTechnique +{ + protected Network network; + protected Clustering clustering; + protected double resolution; + + public VOSClusteringTechnique(Network network, double resolution) + { + this.network = network; + clustering = new Clustering(network.nNodes); + clustering.initSingletonClusters(); + this.resolution = resolution; + } + + public VOSClusteringTechnique(Network network, Clustering clustering, double resolution) + { + this.network = network; + this.clustering = clustering; + this.resolution = resolution; + } + + public Network getNetwork() + { + return network; + } + + public Clustering getClustering() + { + return clustering; + } + + public double getResolution() + { + return 
resolution; + } + + public void setNetwork(Network network) + { + this.network = network; + } + + public void setClustering(Clustering clustering) + { + this.clustering = clustering; + } + + public void setResolution(double resolution) + { + this.resolution = resolution; + } + + public double calcQualityFunction() + { + double qualityFunction; + double[] clusterWeight; + int i, j, k; + + qualityFunction = 0; + + for (i = 0; i < network.nNodes; i++) + { + j = clustering.cluster[i]; + for (k = network.firstNeighborIndex[i]; k < network.firstNeighborIndex[i + 1]; k++) + if (clustering.cluster[network.neighbor[k]] == j) + qualityFunction += network.edgeWeight[k]; + } + qualityFunction += network.totalEdgeWeightSelfLinks; + + clusterWeight = new double[clustering.nClusters]; + for (i = 0; i < network.nNodes; i++) + clusterWeight[clustering.cluster[i]] += network.nodeWeight[i]; + for (i = 0; i < clustering.nClusters; i++) + qualityFunction -= clusterWeight[i] * clusterWeight[i] * resolution; + + qualityFunction /= 2 * network.getTotalEdgeWeight() + network.totalEdgeWeightSelfLinks; + + return qualityFunction; + } + + public boolean runLocalMovingAlgorithm() + { + return runLocalMovingAlgorithm(new Random()); + } + + public boolean runLocalMovingAlgorithm(Random random) + { + boolean update; + double maxQualityFunction, qualityFunction; + double[] clusterWeight, edgeWeightPerCluster; + int bestCluster, i, j, k, l, nNeighboringClusters, nStableNodes, nUnusedClusters; + int[] neighboringCluster, newCluster, nNodesPerCluster, nodePermutation, unusedCluster; + + if (network.nNodes == 1) + return false; + + update = false; + + clusterWeight = new double[network.nNodes]; + nNodesPerCluster = new int[network.nNodes]; + for (i = 0; i < network.nNodes; i++) + { + clusterWeight[clustering.cluster[i]] += network.nodeWeight[i]; + nNodesPerCluster[clustering.cluster[i]]++; + } + + nUnusedClusters = 0; + unusedCluster = new int[network.nNodes]; + for (i = 0; i < network.nNodes; i++) + 
if (nNodesPerCluster[i] == 0) + { + unusedCluster[nUnusedClusters] = i; + nUnusedClusters++; + } + + nodePermutation = Arrays2.generateRandomPermutation(network.nNodes, random); + + edgeWeightPerCluster = new double[network.nNodes]; + neighboringCluster = new int[network.nNodes - 1]; + nStableNodes = 0; + i = 0; + do + { + j = nodePermutation[i]; + + nNeighboringClusters = 0; + for (k = network.firstNeighborIndex[j]; k < network.firstNeighborIndex[j + 1]; k++) + { + l = clustering.cluster[network.neighbor[k]]; + if (edgeWeightPerCluster[l] == 0) + { + neighboringCluster[nNeighboringClusters] = l; + nNeighboringClusters++; + } + edgeWeightPerCluster[l] += network.edgeWeight[k]; + } + + clusterWeight[clustering.cluster[j]] -= network.nodeWeight[j]; + nNodesPerCluster[clustering.cluster[j]]--; + if (nNodesPerCluster[clustering.cluster[j]] == 0) + { + unusedCluster[nUnusedClusters] = clustering.cluster[j]; + nUnusedClusters++; + } + + bestCluster = -1; + maxQualityFunction = 0; + for (k = 0; k < nNeighboringClusters; k++) + { + l = neighboringCluster[k]; + qualityFunction = edgeWeightPerCluster[l] - network.nodeWeight[j] * clusterWeight[l] * resolution; + if ((qualityFunction > maxQualityFunction) || ((qualityFunction == maxQualityFunction) && (l < bestCluster))) + { + bestCluster = l; + maxQualityFunction = qualityFunction; + } + edgeWeightPerCluster[l] = 0; + } + if (maxQualityFunction == 0) + { + bestCluster = unusedCluster[nUnusedClusters - 1]; + nUnusedClusters--; + } + + clusterWeight[bestCluster] += network.nodeWeight[j]; + nNodesPerCluster[bestCluster]++; + if (bestCluster == clustering.cluster[j]) + nStableNodes++; + else + { + clustering.cluster[j] = bestCluster; + nStableNodes = 1; + update = true; + } + + i = (i < network.nNodes - 1) ? 
(i + 1) : 0; + } + while (nStableNodes < network.nNodes); + + newCluster = new int[network.nNodes]; + clustering.nClusters = 0; + for (i = 0; i < network.nNodes; i++) + if (nNodesPerCluster[i] > 0) + { + newCluster[i] = clustering.nClusters; + clustering.nClusters++; + } + for (i = 0; i < network.nNodes; i++) + clustering.cluster[i] = newCluster[clustering.cluster[i]]; + + return update; + } + + public boolean runLouvainAlgorithm() + { + return runLouvainAlgorithm(new Random()); + } + + public boolean runLouvainAlgorithm(Random random) + { + boolean update, update2; + VOSClusteringTechnique VOSClusteringTechnique; + + if (network.nNodes == 1) + return false; + + update = runLocalMovingAlgorithm(random); + + if (clustering.nClusters < network.nNodes) + { + VOSClusteringTechnique = new VOSClusteringTechnique(network.createReducedNetwork(clustering), resolution); + + update2 = VOSClusteringTechnique.runLouvainAlgorithm(random); + + if (update2) + { + update = true; + + clustering.mergeClusters(VOSClusteringTechnique.clustering); + } + } + + return update; + } + + public boolean runIteratedLouvainAlgorithm(int maxNIterations) + { + return runIteratedLouvainAlgorithm(maxNIterations, new Random()); + } + + public boolean runIteratedLouvainAlgorithm(int maxNIterations, Random random) + { + boolean update; + int i; + + i = 0; + do + { + update = runLouvainAlgorithm(random); + i++; + } + while ((i < maxNIterations) && update); + return ((i > 1) || update); + } + + public boolean runLouvainAlgorithmWithMultilevelRefinement() + { + return runLouvainAlgorithmWithMultilevelRefinement(new Random()); + } + + public boolean runLouvainAlgorithmWithMultilevelRefinement(Random random) + { + boolean update, update2; + VOSClusteringTechnique VOSClusteringTechnique; + + if (network.nNodes == 1) + return false; + + update = runLocalMovingAlgorithm(random); + + if (clustering.nClusters < network.nNodes) + { + VOSClusteringTechnique = new 
VOSClusteringTechnique(network.createReducedNetwork(clustering), resolution); + + update2 = VOSClusteringTechnique.runLouvainAlgorithmWithMultilevelRefinement(random); + + if (update2) + { + update = true; + + clustering.mergeClusters(VOSClusteringTechnique.clustering); + + runLocalMovingAlgorithm(random); + } + } + + return update; + } + + public boolean runIteratedLouvainAlgorithmWithMultilevelRefinement(int maxNIterations) + { + return runIteratedLouvainAlgorithmWithMultilevelRefinement(maxNIterations, new Random()); + } + + public boolean runIteratedLouvainAlgorithmWithMultilevelRefinement(int maxNIterations, Random random) + { + boolean update; + int i; + + i = 0; + do + { + update = runLouvainAlgorithmWithMultilevelRefinement(random); + i++; + } + while ((i < maxNIterations) && update); + return ((i > 1) || update); + } + + public boolean runSmartLocalMovingAlgorithm() + { + return runSmartLocalMovingAlgorithm(new Random()); + } + + public boolean runSmartLocalMovingAlgorithm(Random random) + { + boolean update; + int i, j, k; + int[] nNodesPerClusterReducedNetwork; + int[][] nodePerCluster; + Network[] subnetwork; + VOSClusteringTechnique VOSClusteringTechnique; + + if (network.nNodes == 1) + return false; + + update = runLocalMovingAlgorithm(random); + + if (clustering.nClusters < network.nNodes) + { + subnetwork = network.createSubnetworks(clustering); + + nodePerCluster = clustering.getNodesPerCluster(); + + clustering.nClusters = 0; + nNodesPerClusterReducedNetwork = new int[subnetwork.length]; + for (i = 0; i < subnetwork.length; i++) + { + VOSClusteringTechnique = new VOSClusteringTechnique(subnetwork[i], resolution); + + VOSClusteringTechnique.runLocalMovingAlgorithm(random); + + for (j = 0; j < subnetwork[i].nNodes; j++) + clustering.cluster[nodePerCluster[i][j]] = clustering.nClusters + VOSClusteringTechnique.clustering.cluster[j]; + clustering.nClusters += VOSClusteringTechnique.clustering.nClusters; + nNodesPerClusterReducedNetwork[i] = 
VOSClusteringTechnique.clustering.nClusters; + } + + VOSClusteringTechnique = new VOSClusteringTechnique(network.createReducedNetwork(clustering), resolution); + + i = 0; + for (j = 0; j < nNodesPerClusterReducedNetwork.length; j++) + for (k = 0; k < nNodesPerClusterReducedNetwork[j]; k++) + { + VOSClusteringTechnique.clustering.cluster[i] = j; + i++; + } + VOSClusteringTechnique.clustering.nClusters = nNodesPerClusterReducedNetwork.length; + + update |= VOSClusteringTechnique.runSmartLocalMovingAlgorithm(random); + + clustering.mergeClusters(VOSClusteringTechnique.clustering); + } + + return update; + } + + public boolean runIteratedSmartLocalMovingAlgorithm(int nIterations) + { + return runIteratedSmartLocalMovingAlgorithm(nIterations, new Random()); + } + + public boolean runIteratedSmartLocalMovingAlgorithm(int nIterations, Random random) + { + boolean update; + int i; + + update = false; + for (i = 0; i < nIterations; i++) + update |= runSmartLocalMovingAlgorithm(random); + return update; + } + + public int removeCluster(int cluster) + { + double maxQualityFunction, qualityFunction; + double[] clusterWeight, totalEdgeWeightPerCluster; + int i, j; + + clusterWeight = new double[clustering.nClusters]; + totalEdgeWeightPerCluster = new double[clustering.nClusters]; + for (i = 0; i < network.nNodes; i++) + { + clusterWeight[clustering.cluster[i]] += network.nodeWeight[i]; + if (clustering.cluster[i] == cluster) + for (j = network.firstNeighborIndex[i]; j < network.firstNeighborIndex[i + 1]; j++) + totalEdgeWeightPerCluster[clustering.cluster[network.neighbor[j]]] += network.edgeWeight[j]; + } + + i = -1; + maxQualityFunction = 0; + for (j = 0; j < clustering.nClusters; j++) + if ((j != cluster) && (clusterWeight[j] > 0)) + { + qualityFunction = totalEdgeWeightPerCluster[j] / clusterWeight[j]; + if (qualityFunction > maxQualityFunction) + { + i = j; + maxQualityFunction = qualityFunction; + } + } + + if (i >= 0) + { + for (j = 0; j < network.nNodes; j++) + if 
(clustering.cluster[j] == cluster) + clustering.cluster[j] = i; + if (cluster == clustering.nClusters - 1) + clustering.nClusters = Arrays2.calcMaximum(clustering.cluster) + 1; + } + + return i; + } + + public void removeSmallClusters(int minNNodesPerCluster) + { + int i, j, k; + int[] nNodesPerCluster; + VOSClusteringTechnique VOSClusteringTechnique; + + VOSClusteringTechnique = new VOSClusteringTechnique(network.createReducedNetwork(clustering), resolution); + + nNodesPerCluster = clustering.getNNodesPerCluster(); + + do + { + i = -1; + j = minNNodesPerCluster; + for (k = 0; k < VOSClusteringTechnique.clustering.nClusters; k++) + if ((nNodesPerCluster[k] > 0) && (nNodesPerCluster[k] < j)) + { + i = k; + j = nNodesPerCluster[k]; + } + + if (i >= 0) + { + j = VOSClusteringTechnique.removeCluster(i); + if (j >= 0) + nNodesPerCluster[j] += nNodesPerCluster[i]; + nNodesPerCluster[i] = 0; + } + } + while (i >= 0); + + clustering.mergeClusters(VOSClusteringTechnique.clustering); + } +} diff --git a/man/AddImputedScore.Rd b/man/AddImputedScore.Rd new file mode 100644 index 000000000..a7a9d3fde --- /dev/null +++ b/man/AddImputedScore.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{AddImputedScore} +\alias{AddImputedScore} +\title{Calculate imputed expression values} +\usage{ +AddImputedScore(object, genes.use = NULL, genes.fit = NULL, s.use = 20, + do.print = FALSE, gram = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{A vector of genes (predictors) that can be used for +building the LASSO models.} + +\item{genes.fit}{A vector of genes to impute values for} + +\item{s.use}{Maximum number of steps taken by the algorithm (lower values +indicate a greater degree of smoothing)} + +\item{do.print}{Print progress (output the name of each gene after it has +been imputed).} + +\item{gram}{The use.gram argument passed to lars} +} +\value{ +Returns a Seurat object where the imputed 
values have been added to +object@imputed +} +\description{ +Uses L1-constrained linear models (LASSO) to impute single cell gene +expression values. +} diff --git a/man/AddMetaData.Rd b/man/AddMetaData.Rd new file mode 100644 index 000000000..0c1498fe4 --- /dev/null +++ b/man/AddMetaData.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{AddMetaData} +\alias{AddMetaData} +\title{Add Metadata} +\usage{ +AddMetaData(object, metadata, col.name = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{metadata}{Data frame where the row names are cell names (note : these +must correspond exactly to the items in object@cell.names), and the columns +are additional metadata items.} + +\item{col.name}{Name for metadata if passing in single vector of information} +} +\value{ +Seurat object where the additional metadata has been added as +columns in object@meta.data +} +\description{ +Adds additional data for single cells to the Seurat object. Can be any piece +of information associated with a cell (examples include read depth, +alignment rate, experimental batch, or subpopulation identity). The +advantage of adding it to the Seurat object is so that it can be +analyzed/visualized using FetchData, VlnPlot, GenePlot, SubsetData, etc. 
+} diff --git a/man/AddSamples.Rd b/man/AddSamples.Rd new file mode 100644 index 000000000..ac5ec0524 --- /dev/null +++ b/man/AddSamples.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{AddSamples} +\alias{AddSamples} +\title{Add samples into existing Seurat object.} +\usage{ +AddSamples(object, new.data, project = NULL, min.cells = 3, + min.genes = 1000, is.expr = 0, normalization.method = NULL, + scale.factor = 10000, do.scale = TRUE, do.center = TRUE, + names.field = 1, names.delim = "_", meta.data = NULL, save.raw = TRUE, + add.cell.id = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{new.data}{Data matrix for samples to be added} + +\item{project}{Project name (string)} + +\item{min.cells}{Include genes with detected expression in at least this +many cells} + +\item{min.genes}{Include cells where at least this many genes are detected} + +\item{is.expr}{Expression threshold for 'detected' gene} + +\item{normalization.method}{Normalize the data after merging. Default is TRUE. +If set, will perform the same normalization strategy as stored for the first +object} + +\item{scale.factor}{scale factor in the log normalization} + +\item{do.scale}{In object@scale.data, perform row-scaling (gene-based z-score)} + +\item{do.center}{In object@scale.data, perform row-centering (gene-based +centering)} + +\item{names.field}{For the initial identity class for each cell, choose this +field from the cell's column name} + +\item{names.delim}{For the initial identity class for each cell, choose this +delimiter from the cell's column name} + +\item{meta.data}{Additional metadata to add to the Seurat object. Should be +a data frame where the rows are cell names, and the columns are additional +metadata fields} + +\item{save.raw}{TRUE by default. 
If FALSE, do not save the unmodified data in object@raw.data +which will save memory downstream for large datasets} + +\item{add.cell.id}{String to be appended to the names of all cells in new.data. E.g. if add.cell.id = "rep1", +"cell1" becomes "cell1.rep1"} +} +\description{ +Add samples into existing Seurat object. +} diff --git a/man/AddSmoothedScore.Rd b/man/AddSmoothedScore.Rd new file mode 100644 index 000000000..20aeba0b4 --- /dev/null +++ b/man/AddSmoothedScore.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{AddSmoothedScore} +\alias{AddSmoothedScore} +\title{Calculate smoothed expression values} +\usage{ +AddSmoothedScore(object, genes.fit = NULL, dim.1 = 1, dim.2 = 2, + reduction.use = "tSNE", k = 30, do.log = FALSE, do.print = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.fit}{Genes to calculate smoothed values for} + +\item{dim.1}{Dimension for x-axis (default 1)} + +\item{dim.2}{Dimension for y-axis (default 2)} + +\item{reduction.use}{Which dimensionality reduction to use. Default is +"tsne", can also be "pca", or "ica", assuming these are precomputed.} + +\item{k}{k-param for k-nearest neighbor calculation} + +\item{do.log}{Whether to perform smoothing in log space. 
Default is false.} + +\item{do.print}{Print progress (output the name of each gene after it has +been imputed).} +} +\description{ +Smooths expression values across the k-nearest neighbors based on dimensional reduction +} diff --git a/man/AlignSubspace.Rd b/man/AlignSubspace.Rd new file mode 100644 index 000000000..a23a975d5 --- /dev/null +++ b/man/AlignSubspace.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{AlignSubspace} +\alias{AlignSubspace} +\title{Align subspaces using dynamic time warping (DTW)} +\usage{ +AlignSubspace(object, reduction.type, grouping.var, dims.align, + num.genes = 30, show.plots = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{reduction to align scores for} + +\item{grouping.var}{Name of the grouping variable for which to align the scores} + +\item{dims.align}{Dims to align, default is all} + +\item{num.genes}{Number of genes to use in construction of "metagene"} + +\item{show.plots}{show debugging plots} +} +\value{ +Returns Seurat object with the dims aligned, stored in + object@dr$reduction.type.aligned +} +\description{ +Aligns subspaces so that they line up across grouping variable (only +implemented for case with 2 categories in grouping.var) +} diff --git a/man/AssessNodes.Rd b/man/AssessNodes.Rd new file mode 100644 index 000000000..10d5d534b --- /dev/null +++ b/man/AssessNodes.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_validation.R +\name{AssessNodes} +\alias{AssessNodes} +\title{Assess Internal Nodes} +\usage{ +AssessNodes(object, node.list, all.below = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{node.list}{List of internal nodes to assess and return} + +\item{all.below}{If single node provided in node.list, assess all splits below (and including) +provided node +.} +} +\value{ +Returns the Out of Bag error for a random forest 
classifiers trained on +each internal node split or each split provided in the node list. +} +\description{ +Method for automating assessment of tree splits over all internal nodes, +or a provided list of internal nodes. Uses AssessSplit() for calculation +of Out of Bag error (proxy for confidence in split). +} diff --git a/man/AssessSplit.Rd b/man/AssessSplit.Rd new file mode 100644 index 000000000..ee710573c --- /dev/null +++ b/man/AssessSplit.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_validation.R +\name{AssessSplit} +\alias{AssessSplit} +\title{Assess Cluster Split} +\usage{ +AssessSplit(object, node, cluster1, cluster2, print.output = TRUE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{node}{Node in the cluster tree in question} + +\item{cluster1}{First cluster to compare} + +\item{cluster2}{Second cluster to compare} + +\item{print.output}{Print the OOB error for the classifier} + +\item{...}{Arguments passed on to \code{BuildRFClassifier} +\describe{ + \item{training.genes}{Vector of genes to build the classifier on} + \item{training.classes}{Vector of classes to build the classifier on} + \item{verbose}{Additional progress print statements} +}} +} +\value{ +Returns the Out of Bag error for a random forest classifier +trained on the split from the given node +} +\description{ +Method for determining confidence in specific bifurcations in +the cluster tree. Use the Out of Bag (OOB) error of a random +forest classifier to judge confidence. 
+} diff --git a/man/AverageDetectionRate.Rd b/man/AverageDetectionRate.Rd new file mode 100644 index 000000000..25d035813 --- /dev/null +++ b/man/AverageDetectionRate.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{AverageDetectionRate} +\alias{AverageDetectionRate} +\title{Probability of detection by identity class} +\usage{ +AverageDetectionRate(object, thresh.min = 0) +} +\arguments{ +\item{object}{Seurat object} + +\item{thresh.min}{Minimum threshold to define 'detected' (log-scale)} +} +\value{ +Returns a matrix with genes as rows, identity classes as columns. +} +\description{ +For each gene, calculates the probability of detection for each identity +class. +} diff --git a/man/AverageExpression.Rd b/man/AverageExpression.Rd new file mode 100644 index 000000000..c7fc2c0ee --- /dev/null +++ b/man/AverageExpression.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{AverageExpression} +\alias{AverageExpression} +\title{Averaged gene expression by identity class} +\usage{ +AverageExpression(object, genes.use = NULL, return.seurat = FALSE, + add.ident = NULL, use.scale = FALSE, use.raw = FALSE, + show.progress = TRUE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Genes to analyze. Default is all genes.} + +\item{return.seurat}{Whether to return the data as a Seurat object. Default is false.} + +\item{add.ident}{Place an additional label on each cell prior to averaging (very useful if you want to observe cluster averages, separated by replicate, for example).} + +\item{use.scale}{Use scaled values for gene expression} + +\item{use.raw}{Use raw values for gene expression} + +\item{show.progress}{Show progress bar (default is T)} + +\item{...}{Arguments to be passed to methods such as \code{\link{Seurat}}} +} +\value{ +Returns a matrix with genes as rows, identity classes as columns. 
+} +\description{ +Returns gene expression for an 'average' single cell in each identity class +} +\details{ +Output is in log-space, but averaging is done in non-log space. +} diff --git a/man/AveragePCA.Rd b/man/AveragePCA.Rd new file mode 100644 index 000000000..d9c868802 --- /dev/null +++ b/man/AveragePCA.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{AveragePCA} +\alias{AveragePCA} +\title{Average PCA scores by identity class} +\usage{ +AveragePCA(object) +} +\arguments{ +\item{object}{Seurat object} +} +\value{ +Returns a matrix with genes as rows, identity classes as columns +} +\description{ +Returns the PCA scores for an 'average' single cell in each identity class +} diff --git a/man/BuildClusterTree.Rd b/man/BuildClusterTree.Rd new file mode 100644 index 000000000..9578e7e62 --- /dev/null +++ b/man/BuildClusterTree.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{BuildClusterTree} +\alias{BuildClusterTree} +\title{Phylogenetic Analysis of Identity Classes} +\usage{ +BuildClusterTree(object, genes.use = NULL, pcs.use = NULL, SNN.use = NULL, + do.plot = TRUE, do.reorder = FALSE, reorder.numeric = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Genes to use for the analysis. Default is the set of +variable genes (object@var.genes). Assumes pcs.use=NULL (tree calculated in +gene expression space)} + +\item{pcs.use}{If set, tree is calculated in PCA space, using the +eigenvalue-WeightedEucleideanDist distance across these PC scores.} + +\item{SNN.use}{If SNN is passed, build tree based on SNN graph connectivity between clusters} + +\item{do.plot}{Plot the resulting phylogenetic tree} + +\item{do.reorder}{Re-order identity classes (factor ordering), according to +position on the tree. 
This groups similar classes together which can be +helpful, for example, when drawing violin plots.} + +\item{reorder.numeric}{Re-order identity classes according to position on +the tree, assigning a numeric value ('1' is the leftmost node)} +} +\value{ +A Seurat object where the cluster tree is stored in +object@cluster.tree[[1]] +} +\description{ +Constructs a phylogenetic tree relating the 'average' cell from each +identity class. Tree is estimated based on a distance matrix constructed in +either gene expression space or PCA space. +} +\details{ +Note that the tree is calculated for an 'average' cell, so gene expression +or PC scores are averaged across all cells in an identity class before the +tree is constructed. +} diff --git a/man/BuildRFClassifier.Rd b/man/BuildRFClassifier.Rd new file mode 100644 index 000000000..378e22caa --- /dev/null +++ b/man/BuildRFClassifier.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{BuildRFClassifier} +\alias{BuildRFClassifier} +\title{Build Random Forest Classifier} +\usage{ +BuildRFClassifier(object, training.genes = NULL, training.classes = NULL, + verbose = TRUE, ...) 
+} +\arguments{ +\item{object}{Seurat object on which to train the classifier} + +\item{training.genes}{Vector of genes to build the classifier on} + +\item{training.classes}{Vector of classes to build the classifier on} + +\item{verbose}{Additional progress print statements} + +\item{...}{additional parameters passed to ranger} +} +\value{ +Returns the random forest classifier +} +\description{ +Train the random forest classifier +} diff --git a/man/BuildSNN.Rd b/man/BuildSNN.Rd new file mode 100644 index 000000000..358075c6f --- /dev/null +++ b/man/BuildSNN.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/snn.R +\name{BuildSNN} +\alias{BuildSNN} +\title{SNN Graph Construction} +\usage{ +BuildSNN(object, genes.use = NULL, reduction.type = "pca", + dims.use = NULL, k.param = 10, k.scale = 10, plot.SNN = FALSE, + prune.SNN = 1/15, print.output = TRUE, distance.matrix = NULL, + force.recalc = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{A vector of gene names to use in construction of SNN graph +if building directly based on expression data rather than a dimensionally +reduced representation (i.e. PCs).} + +\item{reduction.type}{Name of dimensional reduction technique to use in +construction of SNN graph. (e.g. "pca", "ica")} + +\item{dims.use}{A vector of the dimensions to use in construction of the SNN +graph (e.g. To use the first 10 PCs, pass 1:10)} + +\item{k.param}{Defines k for the k-nearest neighbor algorithm} + +\item{k.scale}{Granularity option for k.param} + +\item{plot.SNN}{Plot the SNN graph} + +\item{prune.SNN}{Sets the cutoff for acceptable Jaccard distances when +computing the neighborhood overlap for the SNN construction. Any edges with +values less than or equal to this will be set to 0 and removed from the SNN +graph. 
Essentially sets the stringency of pruning
+\title{Cell-cell scatter plot} +\usage{ +CellPlot(object, cell1, cell2, gene.ids = NULL, col.use = "black", + nrpoints.use = Inf, pch.use = 16, cex.use = 0.5, do.hover = FALSE, + do.identify = FALSE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{cell1}{Cell 1 name (can also be a number, representing the position in +object@cell.names)} + +\item{cell2}{Cell 2 name (can also be a number, representing the position in +object@cell.names)} + +\item{gene.ids}{Genes to plot (default, all genes)} + +\item{col.use}{Colors to use for the points} + +\item{nrpoints.use}{Parameter for smoothScatter} + +\item{pch.use}{Point symbol to use} + +\item{cex.use}{Point size} + +\item{do.hover}{Enable hovering over points to view information} + +\item{do.identify}{Opens a locator session to identify clusters of cells. +points to reveal gene names (hit ESC to stop)} + +\item{\dots}{Additional arguments to pass to smoothScatter} +} +\value{ +No return value (plots a scatter plot) +} +\description{ +Creates a plot of scatter plot of genes across two single cells +} diff --git a/man/ClassifyCells.Rd b/man/ClassifyCells.Rd new file mode 100644 index 000000000..9b84cb693 --- /dev/null +++ b/man/ClassifyCells.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{ClassifyCells} +\alias{ClassifyCells} +\title{Classify New Data} +\usage{ +ClassifyCells(object, classifier, training.genes = NULL, + training.classes = NULL, new.data = NULL, ...) +} +\arguments{ +\item{object}{Seurat object on which to train the classifier} + +\item{classifier}{Random Forest classifier from BuildRFClassifier. 
If not provided, +it will be built from the training data provided.} + +\item{training.genes}{Vector of genes to build the classifier on} + +\item{training.classes}{Vector of classes to build the classifier on} + +\item{new.data}{New data to classify} + +\item{...}{additional parameters passed to ranger} +} +\value{ +Vector of cluster ids +} +\description{ +Classify new data based on the cluster information of the provided object. +Random Forests are used as the basis of the classification. +} diff --git a/man/ColorTSNESplit.Rd b/man/ColorTSNESplit.Rd new file mode 100644 index 000000000..0e2c0d29b --- /dev/null +++ b/man/ColorTSNESplit.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{ColorTSNESplit} +\alias{ColorTSNESplit} +\title{Color tSNE Plot Based on Split} +\usage{ +ColorTSNESplit(object, node, color1 = "red", color2 = "blue", + color3 = "gray", ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{node}{Node in cluster tree on which to base the split} + +\item{color1}{Color for the left side of the split} + +\item{color2}{Color for the right side of the split} + +\item{color3}{Color for all other cells} + +\item{...}{Arguments passed on to \code{TSNEPlot} +\describe{ + \item{do.label}{FALSE by default. If TRUE, plots an alternate view where the center of each +cluster is labeled} + \item{pt.size}{Set the point size} + \item{label.size}{Set the size of the text labels} + \item{cells.use}{Vector of cell names to use in the plot.} + \item{colors.use}{Manually set the color palette to use for the points} +}} +} +\value{ +Returns a tSNE plot +} +\description{ +Returns a tSNE plot colored based on whether the cells fall in clusters +to the left or to the right of a node split in the cluster tree. 
+} diff --git a/man/CreateSeuratObject.Rd b/man/CreateSeuratObject.Rd new file mode 100644 index 000000000..b39132c2d --- /dev/null +++ b/man/CreateSeuratObject.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{CreateSeuratObject} +\alias{CreateSeuratObject} +\title{Initialize and setup the Seurat object} +\usage{ +CreateSeuratObject(raw.data, project = "SeuratProject", min.cells = 0, + min.genes = 0, is.expr = 0, normalization.method = NULL, + scale.factor = 10000, do.scale = FALSE, do.center = FALSE, + names.field = 1, names.delim = "_", meta.data = NULL, save.raw = TRUE) +} +\arguments{ +\item{raw.data}{Raw input data} + +\item{project}{Project name (string)} + +\item{min.cells}{Include genes with detected expression in at least this +many cells. Will subset the raw.data matrix as well. To reintroduce excluded +genes, create a new object with a lower cutoff.} + +\item{min.genes}{Include cells where at least this many genes are detected.} + +\item{is.expr}{Expression threshold for 'detected' gene. For most datasets, particularly UMI +datasets, will be set to 0 (default). If not, when initializing, this should be set to a level +based on pre-normalized counts (i.e. require at least 5 counts to be treated as expresesd) All +values less than this will be set to 0 (though maintained in object@raw.data).} + +\item{normalization.method}{Method for cell normalization. Default is no normalization. +In this case, run NormalizeData later in the workflow. As a shortcut, you can specify a +normalization method (i.e. LogNormalize) here directly.} + +\item{scale.factor}{If normalizing on the cell level, this sets the scale factor.} + +\item{do.scale}{In object@scale.data, perform row-scaling (gene-based +z-score). FALSE by default. In this case, run ScaleData later in the workflow. 
As a shortcut, you +can specify do.scale=T (and do.center=T) here.} + +\item{do.center}{In object@scale.data, perform row-centering (gene-based centering)} + +\item{names.field}{For the initial identity class for each cell, choose this field from the +cell's column name} + +\item{names.delim}{For the initial identity class for each cell, choose this delimiter from the +cell's column name} + +\item{meta.data}{Additional metadata to add to the Seurat object. Should be a data frame where +the rows are cell names, and the columns are additional metadata fields} + +\item{save.raw}{TRUE by default. If FALSE, do not save the unmodified data in object@raw.data +which will save memory downstream for large datasets} +} +\value{ +Returns a Seurat object with the raw data stored in object@raw.data. +object@data, object@meta.data, object@ident, also initialized. +} +\description{ +Initializes the Seurat object and some optional filtering +} diff --git a/man/CustomDistance.Rd b/man/CustomDistance.Rd new file mode 100644 index 000000000..6a1a12cc8 --- /dev/null +++ b/man/CustomDistance.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{CustomDistance} +\alias{CustomDistance} +\title{Run a custom distance function on an input data matrix} +\usage{ +CustomDistance(my.mat, my.function, ...) 
+} +\arguments{ +\item{my.mat}{A matrix to calculate distance on} + +\item{my.function}{A function to calculate distance} + +\item{...}{Extra parameters to my.function} +} +\value{ +A distance matrix +} +\description{ +Run a custom distance function on an input data matrix +} +\author{ +Jean Fan +} diff --git a/man/CustomPalette.Rd b/man/CustomPalette.Rd new file mode 100644 index 000000000..b62de8c78 --- /dev/null +++ b/man/CustomPalette.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_utilities.R +\name{CustomPalette} +\alias{CustomPalette} +\title{Create a custom color palette} +\usage{ +CustomPalette(low = "white", high = "red", mid = NULL, k = 50) +} +\arguments{ +\item{low}{low color} + +\item{high}{high color} + +\item{mid}{middle color. Optional.} + +\item{k}{number of steps (colors levels) to include between low and high values} +} +\description{ +Creates a custom color palette based on low, middle, and high color values +} diff --git a/man/DBClustDimension.Rd b/man/DBClustDimension.Rd new file mode 100644 index 000000000..2ef33b33c --- /dev/null +++ b/man/DBClustDimension.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{DBClustDimension} +\alias{DBClustDimension} +\title{Perform spectral density clustering on single cells} +\usage{ +DBClustDimension(object, dim.1 = 1, dim.2 = 2, reduction.use = "tsne", + G.use = NULL, set.ident = TRUE, seed.use = 1, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{dim.1}{First dimension to use} + +\item{dim.2}{second dimension to use} + +\item{reduction.use}{Which dimensional reduction to use (either 'pca' or 'ica')} + +\item{G.use}{Parameter for the density clustering. Lower value to get more fine-scale clustering} + +\item{set.ident}{TRUE by default. Set identity class to the results of the density clustering. 
+Unassigned cells (cells that cannot be assigned a cluster) are placed in cluster 1, if there are any.} + +\item{seed.use}{Random seed for the dbscan function} + +\item{...}{Additional arguments to be passed to the dbscan function} +} +\description{ +Find point clounds single cells in a two-dimensional space using density clustering (DBSCAN). +} diff --git a/man/DMEmbed.Rd b/man/DMEmbed.Rd new file mode 100644 index 000000000..47072cc41 --- /dev/null +++ b/man/DMEmbed.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{DMEmbed} +\alias{DMEmbed} +\title{Diffusion Maps Cell Embeddings Accessor Function} +\usage{ +DMEmbed(object, dims.use = NULL, cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{cells.use}{Cells to include (default is all cells)} +} +\value{ +Diffusion maps embedding matrix for given cells and DMs +} +\description{ +Pull Diffusion maps cell embedding matrix +} diff --git a/man/DMLoad.Rd b/man/DMLoad.Rd new file mode 100644 index 000000000..8b178552b --- /dev/null +++ b/man/DMLoad.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{DMLoad} +\alias{DMLoad} +\title{Diffusion Maps Gene Loading Accessor Function} +\usage{ +DMLoad(object, dims.use = NULL, genes.use = NULL, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{genes.use}{Genes to include (default is all)} + +\item{use.full}{Return projected gene loadings (default is FALSE)#'} +} +\value{ +Diffusion maps gene loading matrix for given genes and DMs +} +\description{ +Pull the diffusion maps gene loadings matrix +} diff --git a/man/DMPlot.Rd b/man/DMPlot.Rd new file mode 100644 index 000000000..f6b33ed9a --- /dev/null +++ b/man/DMPlot.Rd 
@@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DMPlot} +\alias{DMPlot} +\title{Plot Diffusion map} +\usage{ +DMPlot(object, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{\dots}{Additional parameters to DimPlot, for example, which dimensions to plot.} +} +\description{ +Graphs the output of a Diffusion map analysis +Cells are colored by their identity class. +} +\details{ +This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +arguments which can be passed in here. +} diff --git a/man/DarkTheme.Rd b/man/DarkTheme.Rd new file mode 100644 index 000000000..ddf0ca92e --- /dev/null +++ b/man/DarkTheme.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_utilities.R +\name{DarkTheme} +\alias{DarkTheme} +\title{Dark Theme} +\usage{ +DarkTheme(...) +} +\arguments{ +\item{...}{Extra parameters to be passed to theme()} +} +\value{ +A ggplot2 theme object +} +\description{ +Add a dark theme to ggplot objects +} +\seealso{ +\code{\link{theme}} +} diff --git a/man/DiffExpTest.Rd b/man/DiffExpTest.Rd new file mode 100644 index 000000000..33efb12ce --- /dev/null +++ b/man/DiffExpTest.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{DiffExpTest} +\alias{DiffExpTest} +\title{Likelihood ratio test for zero-inflated data} +\usage{ +DiffExpTest(object, cells.1, cells.2, genes.use = NULL, print.bar = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} +} +\value{ +Returns a p-value ranked matrix of putative differentially expressed +genes. 
+} +\description{ +Identifies differentially expressed genes between two groups of cells using +the LRT model proposed in McDavid et al, Bioinformatics, 2013 +} diff --git a/man/DiffTTest.Rd b/man/DiffTTest.Rd new file mode 100644 index 000000000..3e8d5c735 --- /dev/null +++ b/man/DiffTTest.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{DiffTTest} +\alias{DiffTTest} +\title{Differential expression testing using Student's t-test} +\usage{ +DiffTTest(object, cells.1, cells.2, genes.use = NULL, print.bar = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} +} +\value{ +Returns a p-value ranked matrix of putative differentially expressed +genes. +} +\description{ +Identify differentially expressed genes between two groups of cells using +the Student's t-test +} diff --git a/man/DimElbowPlot.Rd b/man/DimElbowPlot.Rd new file mode 100644 index 000000000..b8b629773 --- /dev/null +++ b/man/DimElbowPlot.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DimElbowPlot} +\alias{DimElbowPlot} +\title{Quickly Pick Relevant Dimensions} +\usage{ +DimElbowPlot(object, reduction.type = "pca", dims.plot = 20, xlab = "", + ylab = "", title = "") +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Type of dimensional reduction to plot data for} + +\item{dims.plot}{Number of dimensions to plot sd for} + +\item{xlab}{X axis label} + +\item{ylab}{Y axis label} + +\item{title}{Plot title} +} +\value{ +Returns ggplot object +} +\description{ +Plots the standard deviations (or approximate singular values if running PCAFast) +of the principal components for easy identification of an elbow in the 
graph. +This elbow often corresponds well with the significant dims and is much faster to run than +Jackstraw +} diff --git a/man/DimHeatmap.Rd b/man/DimHeatmap.Rd new file mode 100644 index 000000000..9fb2d0225 --- /dev/null +++ b/man/DimHeatmap.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DimHeatmap} +\alias{DimHeatmap} +\title{Dimensional reduction heatmap} +\usage{ +DimHeatmap(object, reduction.type = "pca", dim.use = 1, cells.use = NULL, + num.genes = 30, use.full = FALSE, disp.min = -2.5, disp.max = 2.5, + do.return = FALSE, col.use = pyCols, use.scale = TRUE, + do.balanced = FALSE, remove.key = FALSE, label.columns = NULL, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{A list of cells to plot. If numeric, just plots the top cells.} + +\item{num.genes}{Number of genes to return} + +\item{use.full}{Use the full PCA (projected PCA). Default is FALSE} + +\item{disp.min}{Minimum display value (all values below are clipped)} + +\item{disp.max}{Maximum display value (all values above are clipped)} + +\item{use.scale}{Default is TRUE: plot scaled data. If FALSE, plot raw data on the heatmap.} + +\item{do.balanced}{Return an equal number of genes with both + and - PC scores.} + +\item{remove.key}{Removes the color key from the plot.} + +\item{label.columns}{Whether to label the columns. Default is TRUE for 1 PC, FALSE for > 1 PC} +} +\value{ +If do.return==TRUE, a matrix of scaled values which would be passed +to heatmap.2. Otherwise, no return value, only a graphical output +} +\description{ +Draws a heatmap focusing on a principal component. Both cells and genes are sorted by their +principal component scores. Allows for nice visualization of sources of heterogeneity in the dataset. 
+} diff --git a/man/DimPlot.Rd b/man/DimPlot.Rd new file mode 100644 index 000000000..cc4c18ed1 --- /dev/null +++ b/man/DimPlot.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DimPlot} +\alias{DimPlot} +\title{Dimensional reduction plot} +\usage{ +DimPlot(object, reduction.use = "pca", dim.1 = 1, dim.2 = 2, + cells.use = NULL, pt.size = 3, do.return = FALSE, do.bare = FALSE, + cols.use = NULL, group.by = "ident", pt.shape = NULL, + do.hover = FALSE, data.hover = NULL, do.identify = FALSE, + do.label = FALSE, label.size = 1, no.legend = FALSE, no.axes = FALSE, + dark.theme = FALSE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.use}{Which dimensionality reduction to use. Default is +"pca", can also be "tsne", or "ica", assuming these are precomputed.} + +\item{dim.1}{Dimension for x-axis (default 1)} + +\item{dim.2}{Dimension for y-axis (default 2)} + +\item{cells.use}{Vector of cells to plot (default is all cells)} + +\item{pt.size}{Adjust point size for plotting} + +\item{do.return}{Return a ggplot2 object (default : FALSE)} + +\item{do.bare}{Do only minimal formatting (default : FALSE)} + +\item{cols.use}{Vector of colors, each color corresponds to an identity +class. By default, ggplot assigns colors.} + +\item{group.by}{Group (color) cells in different ways (for example, orig.ident)} + +\item{pt.shape}{If NULL, all points are circles (default). You can specify any +cell attribute (that can be pulled with FetchData) allowing for both different colors and +different shapes on cells.} + +\item{do.hover}{Enable hovering over points to view information} + +\item{data.hover}{Data to add to the hover, pass a character vector of features to add. 
Defaults to cell name} + +\item{do.identify}{Opens a locator session to identify clusters of cells.} + +\item{do.label}{Whether to label the clusters} + +\item{label.size}{Sets size of labels} + +\item{no.legend}{Setting to TRUE will remove the legend} + +\item{no.axes}{Setting to TRUE will remove the axes} + +\item{dark.theme}{Use a dark theme for the plot} +} +\value{ +If do.return==TRUE, returns a ggplot2 object. Otherwise, only +graphical output. +} +\description{ +Graphs the output of a dimensional reduction technique (PCA by default). +Cells are colored by their identity class. +} diff --git a/man/DimTopCells.Rd b/man/DimTopCells.Rd new file mode 100644 index 000000000..fc965b61a --- /dev/null +++ b/man/DimTopCells.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{DimTopCells} +\alias{DimTopCells} +\title{Find cells with highest scores for a given dimensional reduction technique} +\usage{ +DimTopCells(object, dim.use = 1, reduction.type = "pca", num.cells = NULL, + do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{dim.use}{Components to use} + +\item{reduction.type}{Dimensional reduction to find the highest score for} + +\item{num.cells}{Number of cells to return} + +\item{do.balanced}{Return an equal number of cells with both + and - scores.} +} +\value{ +Returns a vector of cells +} +\description{ +Return a list of genes with the strongest contribution to a set of components +} diff --git a/man/DimTopGenes.Rd b/man/DimTopGenes.Rd new file mode 100644 index 000000000..8c2265d4b --- /dev/null +++ b/man/DimTopGenes.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{DimTopGenes} +\alias{DimTopGenes} +\title{Find genes with highest scores for a given dimensional reduction technique} +\usage{ +DimTopGenes(object, dim.use = 1, reduction.type = "pca", num.genes 
= 30, + use.full = FALSE, do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Dimensional reduction to find the highest score for} + +\item{num.genes}{Number of genes to return} + +\item{use.full}{Use the full PCA (projected PCA). Default is FALSE} + +\item{do.balanced}{Return an equal number of genes with both + and - scores.} + +\item{dim.use}{Components to use} +} +\value{ +Returns a vector of genes +} +\description{ +Return a list of genes with the strongest contribution to a set of components +} diff --git a/man/DoHeatmap.Rd b/man/DoHeatmap.Rd new file mode 100644 index 000000000..2ec56b4a9 --- /dev/null +++ b/man/DoHeatmap.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DoHeatmap} +\alias{DoHeatmap} +\title{Gene expression heatmap} +\usage{ +DoHeatmap(object, data.use = NULL, use.scaled = TRUE, cells.use = NULL, + genes.use = NULL, disp.min = -2.5, disp.max = 2.5, group.by = "ident", + draw.line = TRUE, col.low = "#FF00FF", col.mid = "#000000", + col.high = "#FFFF00", slim.col.label = FALSE, remove.key = FALSE, + rotate.key = FALSE, title = NULL, cex.col = 10, cex.row = 10, + group.label.loc = "bottom", group.label.rot = FALSE, group.cex = 15, + group.spacing = 0.15, do.plot = TRUE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{data.use}{Option to pass in data to use in the heatmap. Default will pick from either +object@data or object@scale.data depending on use.scaled parameter. Should have cells as columns +and genes as rows.} + +\item{use.scaled}{Whether to use the data or scaled data if data.use is NULL} + +\item{cells.use}{Cells to include in the heatmap (default is all cells)} + +\item{genes.use}{Genes to include in the heatmap (ordered)} + +\item{disp.min}{Minimum display value (all values below are clipped)} + +\item{disp.max}{Maximum display value (all values above are clipped)} + +\item{group.by}{Groups cells by this variable. 
Default is object@ident} + +\item{draw.line}{Draw vertical lines delineating different groups} + +\item{col.low}{Color for lowest expression value} + +\item{col.mid}{Color for mid expression value} + +\item{col.high}{Color for highest expression value} + +\item{slim.col.label}{display only the identity class name once for each group} + +\item{remove.key}{Removes the color key from the plot.} + +\item{rotate.key}{Rotate color scale horizontally} + +\item{cex.col}{Controls size of column labels (cells)} + +\item{cex.row}{Controls size of row labels (genes)} + +\item{group.label.loc}{Place group labels on bottom or top of plot.} + +\item{group.label.rot}{Whether to rotate the group label.} + +\item{group.cex}{Size of group label text} + +\item{group.spacing}{Controls amount of space between columns.} + +\item{do.plot}{Whether to display the plot.} + +\item{assay.type}{Assay to scale data for. Default is RNA. Can be changed for multimodal analysis} +} +\value{ +Returns a ggplot2 plot object +} +\description{ +Draws a heatmap of single cell gene expression using ggplot2. +} diff --git a/man/DoKMeans.Rd b/man/DoKMeans.Rd new file mode 100644 index 000000000..066f8988f --- /dev/null +++ b/man/DoKMeans.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{DoKMeans} +\alias{DoKMeans} +\title{K-Means Clustering} +\usage{ +DoKMeans(object, genes.use = NULL, k.genes = NULL, k.cells = 0, + k.seed = 1, do.plot = FALSE, data.cut = 2.5, k.cols = pyCols, + set.ident = TRUE, do.constrained = FALSE, assay.type = "RNA", ...) 
+} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Genes to use for clustering} + +\item{k.genes}{K value to use for clustering genes} + +\item{k.cells}{K value to use for clustering cells (default is NULL, cells +are not clustered)} + +\item{k.seed}{Random seed} + +\item{do.plot}{Draw heatmap of clustered genes/cells (default is FALSE).} + +\item{data.cut}{Clip all z-scores to have an absolute value below this. +Reduces the effect of huge outliers in the data.} + +\item{k.cols}{Color palette for heatmap} + +\item{set.ident}{If clustering cells (so k.cells>0), set the cell identity +class to its K-means cluster (default is TRUE)} + +\item{do.constrained}{FALSE by default. If TRUE, use the constrained K-means function implemented in the tclust package.} + +\item{assay.type}{Type of data to normalize for (default is RNA), but can be changed for multimodal analyses.} + +\item{\dots}{Additional parameters passed to kmeans (or tkmeans)} +} +\value{ +Seurat object where the k-means results for genes is stored in + +object@kmeans.obj[[1]], and the k-means results for cells is stored in +object@kmeans.col[[1]]. The cluster for each cell is stored in object@meta.data[,"kmeans.ident"] +and also object@ident (if set.ident=TRUE) +} +\description{ +Perform k=means clustering on both genes and single cells +} +\details{ +K-means and heatmap are calculated on object@scale.data +} diff --git a/man/DotPlot.Rd b/man/DotPlot.Rd new file mode 100644 index 000000000..0a454232d --- /dev/null +++ b/man/DotPlot.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DotPlot} +\alias{DotPlot} +\title{Dot plot visualization} +\usage{ +DotPlot(object, genes.plot, cex.use = 2, cols.use = NULL, + thresh.col = 2.5, dot.min = 0.05, group.by = NULL, ...) 
+} +\arguments{ +\item{object}{Seurat object} + +\item{genes.plot}{Input vector of genes} + +\item{cex.use}{Scaling factor for the dots (scales all dot sizes)} + +\item{cols.use}{colors to plot} + +\item{thresh.col}{The raw data value which corresponds to a red dot (lowest expression)} + +\item{dot.min}{The fraction of cells at which to draw the smallest dot (default is 0.05)} + +\item{group.by}{Factor to group the cells by} +} +\value{ +Only graphical output +} +\description{ +Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +The size of the dot encodes the percentage of cells within a class, while the color encodes the +AverageExpression level of 'expressing' cells (green is high). +} diff --git a/man/DotPlotGG.Rd b/man/DotPlotGG.Rd new file mode 100644 index 000000000..8082d30c0 --- /dev/null +++ b/man/DotPlotGG.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{DotPlotGG} +\alias{DotPlotGG} +\title{Dot plot visualization} +\usage{ +DotPlotGG(object, genes.plot, cols.use = c("green", "red"), col.min = -2.5, + col.max = 2.5, dot.min = 0, dot.scale = 6, group.by, + plot.legend = FALSE, do.return = FALSE, x.lab.rot = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.plot}{Input vector of genes} + +\item{cols.use}{colors to plot} + +\item{col.min}{Minimum scaled average expression threshold (everything smaller will be set to this)} + +\item{col.max}{Maximum scaled average expression threshold (everything larger will be set to this)} + +\item{dot.min}{The fraction of cells at which to draw the smallest dot (default is 0.05).} + +\item{dot.scale}{Scale the size of the points, similar to cex} + +\item{group.by}{Factor to group the cells by} + +\item{plot.legend}{plots the legends} + +\item{do.return}{Return ggplot2 object} + +\item{x.lab.rot}{Rotate x-axis labels} +} +\value{ +default, no return, only graphical output. 
If do.return=TRUE, returns a ggplot2 object +} +\description{ +Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +The size of the dot encodes the percentage of cells within a class, while the color encodes the +AverageExpression level of 'expressing' cells (green is high). +} diff --git a/man/ExpMean.Rd b/man/ExpMean.Rd new file mode 100644 index 000000000..1a554faf4 --- /dev/null +++ b/man/ExpMean.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{ExpMean} +\alias{ExpMean} +\title{Calculate the mean of logged values} +\usage{ +ExpMean(x) +} +\arguments{ +\item{x}{value or vector of values} +} +\value{ +Returns the mean in log-space +} +\description{ +Calculate mean of logged values in non-log space (return answer in log-space) +} diff --git a/man/ExpSD.Rd b/man/ExpSD.Rd new file mode 100644 index 000000000..286ad4e07 --- /dev/null +++ b/man/ExpSD.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{ExpSD} +\alias{ExpSD} +\title{Calculate the standard deviation of logged values} +\usage{ +ExpSD(x) +} +\arguments{ +\item{x}{value or vector of values} +} +\value{ +Returns the standard deviation in log-space +} +\description{ +Calculate SD of logged values in non-log space (return answer in log-space) +} diff --git a/man/ExpVar.Rd b/man/ExpVar.Rd new file mode 100644 index 000000000..7b39c0b0d --- /dev/null +++ b/man/ExpVar.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{ExpVar} +\alias{ExpVar} +\title{Calculate the variance of logged values} +\usage{ +ExpVar(x) +} +\arguments{ +\item{x}{value or vector of values} +} +\value{ +Returns the variance in log-space +} +\description{ +Calculate variance of logged values in non-log space (return answer in +log-space) +} diff --git a/man/ExtractField.Rd 
b/man/ExtractField.Rd new file mode 100644 index 000000000..ffbed08db --- /dev/null +++ b/man/ExtractField.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{ExtractField} +\alias{ExtractField} +\title{Extract delimiter information from a string.} +\usage{ +ExtractField(string, field = 1, delim = "_") +} +\arguments{ +\item{string}{String to parse.} + +\item{field}{Integer(s) indicating which field(s) to extract. Can be a vector multiple numbers.} + +\item{delim}{Delimiter to use, set to underscore by default.} +} +\value{ +A new string, that parses out the requested fields, and (if multiple), rejoins them with the same delimiter +} +\description{ +Parses a string (usually a cell name) and extracts fields based on a delimiter +} diff --git a/man/FastWhichCells.Rd b/man/FastWhichCells.Rd new file mode 100644 index 000000000..713574e13 --- /dev/null +++ b/man/FastWhichCells.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{FastWhichCells} +\alias{FastWhichCells} +\title{FastWhichCells +Identify cells matching certain criteria (limited to character values)} +\usage{ +FastWhichCells(object, group.by, subset.value, invert = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{group.by}{Group cells in different ways (for example, orig.ident). 
Should be a column name in object@meta.data} + +\item{subset.value}{Return cells matching this value} + +\item{invert}{Invert cells to return. FALSE by default} +} +\description{ +FastWhichCells +Identify cells matching certain criteria (limited to character values) +} diff --git a/man/FeatureHeatmap.Rd b/man/FeatureHeatmap.Rd new file mode 100644 index 000000000..7c13dda44 --- /dev/null +++ b/man/FeatureHeatmap.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{FeatureHeatmap} +\alias{FeatureHeatmap} +\title{Visualization of multiple features} +\usage{ +FeatureHeatmap(object, features.plot, dim.1 = 1, dim.2 = 2, + idents.use = NULL, pt.size = 2, cols.use = c("grey", "red"), + pch.use = 16, reduction.use = "tsne", group.by = NULL, + sep.scale = FALSE, do.return = FALSE, min.exp = -Inf, max.exp = Inf, + rotate.key = FALSE, plot.horiz = FALSE, key.position = "right") +} +\arguments{ +\item{object}{Seurat object} + +\item{features.plot}{Vector of features to plot} + +\item{dim.1}{Dimension for x-axis (default 1)} + +\item{dim.2}{Dimension for y-axis (default 2)} + +\item{idents.use}{Which identity classes to display (default is all identity +classes)} + +\item{pt.size}{Adjust point size for plotting} + +\item{cols.use}{Ordered vector of colors to use for plotting. Default is +heat.colors(10).} + +\item{pch.use}{Pch for plotting} + +\item{reduction.use}{Which dimensionality reduction to use. Default is +"tsne", can also be "pca", or "ica", assuming these are precomputed.} + +\item{group.by}{Group cells in different ways (for example, orig.ident)} + +\item{sep.scale}{Scale each group separately. 
Default is FALSE.} + +\item{do.return}{Return the ggplot2 object} + +\item{min.exp}{Min cutoff for scaled expression value} + +\item{max.exp}{Max cutoff for scaled expression value} + +\item{rotate.key}{rotate the legend} + +\item{plot.horiz}{rotate the plot such that the features are columns, groups are the rows} + +\item{key.position}{position of the legend ("top", "right", "bottom", "left")} +} +\value{ +No return value, only a graphical output +} +\description{ +Similar to FeaturePlot, however, also splits the plot by visualizing each +identity class separately. +} +\details{ +Particularly useful for seeing if the same groups of cells co-exhibit a +common feature (i.e. co-express a gene), even within an identity class. Best +understood by example. +} diff --git a/man/FeatureLocator.Rd b/man/FeatureLocator.Rd new file mode 100644 index 000000000..777c5c5f4 --- /dev/null +++ b/man/FeatureLocator.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_utilities.R +\name{FeatureLocator} +\alias{FeatureLocator} +\title{Feature Locator} +\usage{ +FeatureLocator(plot, data.plot, ...) 
+} +\arguments{ +\item{plot}{A ggplot2 plot} + +\item{data.plot}{The original data that went into the ggplot2 plot} + +\item{...}{Extra parameters, such as dark.theme, recolor, or smooth for using a dark theme, +recoloring based on selected cells, or using a smooth scatterplot, respectively} +} +\value{ +The names of the points selected +} +\description{ +Select points on a scatterplot and get information about them +} +\seealso{ +\code{\link{locator}} + +\code{\link{ggplot2::ggplot_build}} +} diff --git a/man/FeaturePlot.Rd b/man/FeaturePlot.Rd new file mode 100644 index 000000000..64e4c5476 --- /dev/null +++ b/man/FeaturePlot.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{FeaturePlot} +\alias{FeaturePlot} +\title{Visualize 'features' on a dimensional reduction plot} +\usage{ +FeaturePlot(object, features.plot, min.cutoff = NA, max.cutoff = NA, + dim.1 = 1, dim.2 = 2, cells.use = NULL, pt.size = 1, + cols.use = c("yellow", "red"), pch.use = 16, overlay = FALSE, + do.hover = FALSE, data.hover = NULL, do.identify = FALSE, + reduction.use = "tsne", use.imputed = FALSE, nCol = NULL, + no.axes = FALSE, no.legend = TRUE, dark.theme = FALSE, + do.return = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{features.plot}{Vector of features to plot} + +\item{min.cutoff}{Vector of minimum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 1, 10)} + +\item{max.cutoff}{Vector of maximum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 1, 10)} + +\item{dim.1}{Dimension for x-axis (default 1)} + +\item{dim.2}{Dimension for y-axis (default 2)} + +\item{cells.use}{Vector of cells to plot (default is all cells)} + +\item{pt.size}{Adjust point size for plotting} + +\item{cols.use}{The two colors to form the gradient over. 
Provide as string vector with +the first color corresponding to low values, the second to high. Also accepts a Brewer +color scale or vector of colors. Note: this will bin the data into number of colors provided.} + +\item{pch.use}{Pch for plotting} + +\item{overlay}{Plot two features overlayed one on top of the other} + +\item{do.hover}{Enable hovering over points to view information} + +\item{data.hover}{Data to add to the hover, pass a character vector of features to add. Defaults to cell name} + +\item{do.identify}{Opens a locator session to identify clusters of cells} + +\item{reduction.use}{Which dimensionality reduction to use. Default is +"tsne", can also be "pca", or "ica", assuming these are precomputed.} + +\item{use.imputed}{Use imputed values for gene expression (default is FALSE)} + +\item{nCol}{Number of columns to use when plotting multiple features.} + +\item{no.axes}{Remove axis labels} + +\item{no.legend}{Remove legend from the graph. Default is TRUE.} + +\item{dark.theme}{Plot in a dark theme} + +\item{do.return}{return the ggplot2 object} +} +\value{ +No return value, only a graphical output +} +\description{ +Colors single cells on a dimensional reduction plot according to a 'feature' +(i.e. gene expression, PC scores, number of genes detected, etc.) +} diff --git a/man/FetchData.Rd b/man/FetchData.Rd new file mode 100644 index 000000000..e28ddf638 --- /dev/null +++ b/man/FetchData.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{FetchData} +\alias{FetchData} +\title{Access cellular data} +\usage{ +FetchData(object, vars.all = NULL, cells.use = NULL, use.imputed = FALSE, + use.scaled = FALSE, use.raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{vars.all}{List of all variables to fetch} + +\item{cells.use}{Cells to collect data for (default is all cells)} + +\item{use.imputed}{For gene expression, use imputed values. 
Default is FALSE} + +\item{use.scaled}{For gene expression, use scaled values. Default is FALSE} + +\item{use.raw}{For gene expression, use raw values. Default is FALSE} +} +\value{ +A data frame with cells as rows and cellular data as columns +} +\description{ +Retrieves data (gene expression, PCA scores, etc, metrics, etc.) for a set +of cells in a Seurat object +} diff --git a/man/FilterCells.Rd b/man/FilterCells.Rd new file mode 100644 index 000000000..e87447812 --- /dev/null +++ b/man/FilterCells.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{FilterCells} +\alias{FilterCells} +\title{Return a subset of the Seurat object} +\usage{ +FilterCells(object, subset.names, low.thresholds, high.thresholds, + cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{subset.names}{Parameters to subset on. Eg, the name of a gene, PC1, a +column name in object@meta.data, etc. Any argument that can be retrieved +using FetchData} + +\item{low.thresholds}{Low cutoffs for the parameters (default is -Inf)} + +\item{high.thresholds}{High cutoffs for the parameters (default is Inf)} + +\item{cells.use}{A vector of cell names to use as a subset} +} +\value{ +Returns a Seurat object containing only the relevant subset of cells +} +\description{ +Creates a Seurat object containing only a subset of the cells in the +original object. Takes either a list of cells to use as a subset, or a +parameter (for example, a gene), to subset on. 
+} diff --git a/man/FindAllMarkers.Rd b/man/FindAllMarkers.Rd new file mode 100644 index 000000000..3055b476a --- /dev/null +++ b/man/FindAllMarkers.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{FindAllMarkers} +\alias{FindAllMarkers} +\title{Gene expression markers for all identity classes} +\usage{ +FindAllMarkers(object, genes.use = NULL, thresh.use = 0.25, + test.use = "bimod", min.pct = 0.1, min.diff.pct = 0.05, + print.bar = TRUE, only.pos = FALSE, max.cells.per.ident = Inf, + return.thresh = 0.01, do.print = FALSE, random.seed = 1, + min.cells = 3, latent.vars = "nUMI") +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Genes to test. Default is to all genes} + +\item{thresh.use}{Limit testing to genes which show, on average, at least +X-fold difference (log-scale) between the two groups of cells. +Increasing thresh.use speeds up the function, but can miss weaker signals.} + +\item{test.use}{Denotes which test to use. Seurat currently implements +"bimod" (likelihood-ratio test for single cell gene expression, McDavid et +al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t" +(Students t-test), and "tobit" (Tobit-test for differential gene expression, +as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'. +The latter two options should only be used on UMI datasets, and assume an underlying +poisson or negative-binomial distribution} + +\item{min.pct}{- only test genes that are detected in a minimum fraction of min.pct cells +in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expression} + +\item{min.diff.pct}{- only test genes that show a minimum difference in the fraction of detection between the two groups. 
Set to -Inf by default} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} + +\item{only.pos}{Only return positive markers (FALSE by default)} + +\item{max.cells.per.ident}{Down sample each identity class to a max number. Default is no downsampling.} + +\item{return.thresh}{Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)} + +\item{do.print}{FALSE by default. If TRUE, outputs updates on progress.} + +\item{random.seed}{Random seed for downsampling} + +\item{min.cells}{Minimum number of cells expressing the gene in at least one of the two groups} + +\item{latent.vars}{remove the effects of these variables} +} +\value{ +Matrix containing a ranked list of putative markers, and associated +statistics (p-values, ROC score, etc.) +} +\description{ +Finds markers (differentially expressed genes) for each of the identity classes in a dataset +} diff --git a/man/FindAllMarkersNode.Rd b/man/FindAllMarkersNode.Rd new file mode 100644 index 000000000..d6f76c327 --- /dev/null +++ b/man/FindAllMarkersNode.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{FindAllMarkersNode} +\alias{FindAllMarkersNode} +\title{Find all markers for a node} +\usage{ +FindAllMarkersNode(object, node = NULL, genes.use = NULL, + thresh.use = 0.25, test.use = "bimod", min.pct = 0.1, + min.diff.pct = 0.05, print.bar = TRUE, only.pos = FALSE, + max.cells.per.ident = Inf, return.thresh = 0.01, do.print = FALSE, + random.seed = 1, min.cells = 3) +} +\arguments{ +\item{object}{Seurat object. Must have object@cluster.tree slot filled. Use BuildClusterTree() if not.} + +\item{node}{Node from which to start identifying split markers, default is top node.} + +\item{genes.use}{Genes to test. 
Default is to use all genes} + +\item{thresh.use}{Limit testing to genes which show, on average, at least +X-fold difference (log-scale) between the two groups of cells.} + +\item{test.use}{Denotes which test to use. Seurat currently implements +"bimod" (likelihood-ratio test for single cell gene expression, McDavid et +al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t" +(Students t-test), and "tobit" (Tobit-test for differential gene expression, +as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'. +The latter two options should only be used on UMI datasets, and assume an underlying +poisson or negative-binomial distribution.} + +\item{min.pct}{- only test genes that are detected in a minimum fraction of min.pct cells +in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed} + +\item{min.diff.pct}{- only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} + +\item{only.pos}{Only return positive markers (FALSE by default)} + +\item{max.cells.per.ident}{Down sample each identity class to a max number. 
Default is no downsampling.} + +\item{return.thresh}{Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)} + +\item{random.seed}{Random seed for downsampling} + +\item{min.cells}{Minimum number of cells expressing the gene in at least one of the two groups} +} +\value{ +Returns a dataframe with a ranked list of putative markers for each node and associated statistics +} +\description{ +This function finds markers for all splits at or below the specified node +} diff --git a/man/FindClusters.Rd b/man/FindClusters.Rd new file mode 100644 index 000000000..4ac5bfe5a --- /dev/null +++ b/man/FindClusters.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{FindClusters} +\alias{FindClusters} +\title{Cluster Determination} +\usage{ +FindClusters(object, genes.use = NULL, reduction.type = "pca", + dims.use = NULL, k.param = 30, k.scale = 25, plot.SNN = FALSE, + prune.SNN = 1/15, print.output = TRUE, distance.matrix = NULL, + save.SNN = FALSE, reuse.SNN = FALSE, force.recalc = FALSE, + modularity.fxn = 1, resolution = 0.8, algorithm = 1, n.start = 100, + n.iter = 10, random.seed = 0, temp.file.location = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{A vector of gene names to use in construction of SNN graph +if building directly based on expression data rather than a dimensionally +reduced representation (i.e. PCs).} + +\item{reduction.type}{Name of dimensional reduction technique to use in +construction of SNN graph. (e.g. "pca", "ica")} + +\item{dims.use}{A vector of the dimensions to use in construction of the SNN +graph (e.g. 
To use the first 10 PCs, pass 1:10)} + +\item{k.param}{Defines k for the k-nearest neighbor algorithm} + +\item{k.scale}{Granularity option for k.param} + +\item{plot.SNN}{Plot the SNN graph} + +\item{prune.SNN}{Sets the cutoff for acceptable Jaccard distances when +computing the neighborhood overlap for the SNN construction. Any edges with +values less than or equal to this will be set to 0 and removed from the SNN +graph. Essentially sets the stringency of pruning (0 --- no pruning, 1 --- +prune everything).} + +\item{print.output}{Whether or not to print output to the console} + +\item{distance.matrix}{Build SNN from distance matrix (experimental)} + +\item{save.SNN}{Saves the SNN matrix associated with the calculation in +object@snn} + +\item{reuse.SNN}{Force utilization of stored SNN. If none stored, this will +throw an error.} + +\item{force.recalc}{Force recalculation of SNN.} + +\item{modularity.fxn}{Modularity function (1 = standard; 2 = alternative).} + +\item{resolution}{Value of the resolution parameter, use a value above +(below) 1.0 if you want to obtain a larger (smaller) number of communities.} + +\item{algorithm}{Algorithm for modularity optimization (1 = original Louvain +algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM +algorithm).} + +\item{n.start}{Number of random starts.} + +\item{n.iter}{Maximal number of iterations per random start.} + +\item{random.seed}{Seed of the random number generator.} + +\item{temp.file.location}{Directory where intermediate files will be written. +Specify the ABSOLUTE path.} +} +\value{ +Returns a Seurat object and optionally the SNN matrix, + object@ident has been updated with new cluster info +} +\description{ +Identify clusters of cells by a shared nearest neighbor (SNN) modularity +optimization based clustering algorithm. First calculate k-nearest neighbors +and construct the SNN graph. Then optimize the modularity function to +determine clusters. 
For a full description of the algorithms, see Waltman and +van Eck (2013) \emph{The European Physical Journal B}. +} diff --git a/man/FindConservedMarkers.Rd b/man/FindConservedMarkers.Rd new file mode 100644 index 000000000..c61de6c45 --- /dev/null +++ b/man/FindConservedMarkers.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{FindConservedMarkers} +\alias{FindConservedMarkers} +\title{Finds markers that are conserved between the two groups} +\usage{ +FindConservedMarkers(object, ident.1, ident.2 = NULL, grouping.var, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{ident.1}{Identity class to define markers for} + +\item{ident.2}{A second identity class for comparison. If NULL (default) - use all other cells +for comparison.} + +\item{grouping.var}{grouping variable} + +\item{\dots}{parameters to pass to FindMarkers} +} +\value{ +Matrix containing a ranked list of putative conserved markers, and associated statistics +(p-values within each group and a combined p-value (fisher_pval), percentage of cells expressing +the marker, average differences) +} +\description{ +Finds markers that are conserved between the two groups +} diff --git a/man/FindMarkers.Rd b/man/FindMarkers.Rd new file mode 100644 index 000000000..08dd67417 --- /dev/null +++ b/man/FindMarkers.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{FindMarkers} +\alias{FindMarkers} +\title{Gene expression markers of identity classes} +\usage{ +FindMarkers(object, ident.1, ident.2 = NULL, genes.use = NULL, + thresh.use = 0.25, test.use = "bimod", min.pct = 0.1, + min.diff.pct = -Inf, print.bar = TRUE, only.pos = FALSE, + max.cells.per.ident = Inf, random.seed = 1, latent.vars = "nUMI", + min.cells = 3) +} +\arguments{ +\item{object}{Seurat object} + +\item{ident.1}{Identity class to define markers for} + +\item{ident.2}{A second 
identity class for comparison. If NULL (default) - +use all other cells for comparison.} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{thresh.use}{Limit testing to genes which show, on average, at least +X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +Increasing thresh.use speeds up the function, but can miss weaker signals.} + +\item{test.use}{Denotes which test to use. Seurat currently implements +"bimod" (likelihood-ratio test for single cell gene expression, McDavid et +al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t" +(Students t-test), and "tobit" (Tobit-test for differential gene expression, +as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'. +The latter two options should only be used on UMI datasets, and assume an underlying +poisson or negative-binomial distribution} + +\item{min.pct}{- only test genes that are detected in a minimum fraction of min.pct cells +in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed. Default is 0.1} + +\item{min.diff.pct}{- only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} + +\item{only.pos}{Only return positive markers (FALSE by default)} + +\item{max.cells.per.ident}{Down sample each identity class to a max number. Default is no downsampling. Not activated by default (set to Inf)} + +\item{random.seed}{Random seed for downsampling} + +\item{min.cells}{Minimum number of cells expressing the gene in at least one of the two groups} +} +\value{ +Matrix containing a ranked list of putative markers, and associated statistics (p-values, ROC score, etc.) 
+} +\description{ +Finds markers (differentially expressed genes) for identity classes +} diff --git a/man/FindMarkersNode.Rd b/man/FindMarkersNode.Rd new file mode 100644 index 000000000..b210dc1cb --- /dev/null +++ b/man/FindMarkersNode.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{FindMarkersNode} +\alias{FindMarkersNode} +\title{Gene expression markers of identity classes defined by a phylogenetic clade} +\usage{ +FindMarkersNode(object, node, tree.use = NULL, genes.use = NULL, + thresh.use = 0.25, test.use = "bimod", ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{node}{The node in the phylogenetic tree to use as a branch point} + +\item{tree.use}{Can optionally pass the tree to be used. Default uses the tree in object@cluster.tree} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{thresh.use}{Limit testing to genes which show, on average, at least +X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +Increasing thresh.use speeds up the function, but can miss weaker signals.} + +\item{test.use}{Denotes which test to use. Seurat currently implements +"bimod" (likelihood-ratio test for single cell gene expression, McDavid et +al., Bioinformatics, 2013, default), "roc" (standard AUC classifier), "t" +(Students t-test), and "tobit" (Tobit-test for differential gene expression, +as in Trapnell et al., Nature Biotech, 2014), 'poisson', and 'negbinom'. +The latter two options should only be used on UMI datasets, and assume an underlying +poisson or negative-binomial distribution} + +\item{...}{Additional arguments passed to FindMarkers} +} +\value{ +Matrix containing a ranked list of putative markers, and associated +statistics (p-values, ROC score, etc.) +} +\description{ +Finds markers (differentially expressed genes) based on a branching point (node) in +the phylogenetic tree. 
Markers that define clusters in the left branch are positive markers. +Markers that define the right branch are negative markers. +} diff --git a/man/FindVariableGenes.Rd b/man/FindVariableGenes.Rd new file mode 100644 index 000000000..d8979d078 --- /dev/null +++ b/man/FindVariableGenes.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{FindVariableGenes} +\alias{FindVariableGenes} +\title{Identify variable genes} +\usage{ +FindVariableGenes(object, mean.function = ExpMean, + dispersion.function = LogVMR, do.plot = TRUE, set.var.genes = TRUE, + x.low.cutoff = 0.1, x.high.cutoff = 8, y.cutoff = 1, + y.high.cutoff = Inf, num.bin = 20, do.recalc = TRUE, + sort.results = TRUE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{mean.function}{Function to compute x-axis value (average expression). Default +is to take the mean of the detected (i.e. non-zero) values} + +\item{dispersion.function}{Function to compute y-axis value (dispersion). Default is to +take the standard deviation of all values/} + +\item{do.plot}{Plot the average/dispersion relationship} + +\item{set.var.genes}{Set object@var.genes to the identified variable genes +(default is TRUE)} + +\item{x.low.cutoff}{Bottom cutoff on x-axis for identifying variable genes} + +\item{x.high.cutoff}{Top cutoff on x-axis for identifying variable genes} + +\item{y.cutoff}{Bottom cutoff on y-axis for identifying variable genes} + +\item{y.high.cutoff}{Top cutoff on y-axis for identifying variable genes} + +\item{num.bin}{Total number of bins to use in the scaled analysis (default +is 20)} + +\item{do.recalc}{TRUE by default. 
If FALSE, plots and selects variable genes without recalculating statistics for each gene.} + +\item{sort.results}{If TRUE (by default), sort results in object@hvg.info in decreasing order of dispersion} + +\item{...}{Extra parameters to VariableGenePlot} +} +\value{ +Returns a Seurat object, placing variable genes in object@var.genes. +The result of all analysis is stored in object@hvg.info +} +\description{ +Identifies genes that are outliers on a 'mean variability plot'. First, uses +a function to calculate average expression (mean.function) and dispersion (dispersion.function) +for each gene. Next, divides genes into num.bin (default 20) bins based on +their average expression, and calculates z-scores for dispersion within each +bin. The purpose of this is to identify variable genes while controlling for +the strong relationship between variability and average expression. +} +\details{ +Exact parameter settings may vary empirically from dataset to dataset, and +based on visual inspection of the plot. +Setting the y.cutoff parameter to 2 identifies genes that are more than two standard +deviations away from the average dispersion within a bin. The default X-axis function +is the mean expression level, and for Y-axis it is the log(Variance/mean). All mean/variance +calculations are not performed in log-space, but the results are reported in log-space - +see relevant functions for exact details. 
+} +\seealso{ +\code{\link{VariableGenePlot}} +} diff --git a/man/FitGeneK.Rd b/man/FitGeneK.Rd new file mode 100644 index 000000000..25100ef66 --- /dev/null +++ b/man/FitGeneK.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial.R +\name{FitGeneK} +\alias{FitGeneK} +\title{Build mixture models of gene expression} +\usage{ +FitGeneK(object, gene, do.k = 2, num.iter = 1, do.plot = FALSE, + genes.use = NULL, start.pct = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{gene}{Gene to fit} + +\item{do.k}{Number of modes for the mixture model (default is 2)} + +\item{num.iter}{Number of 'greedy k-means' iterations (default is 1)} + +\item{do.plot}{Plot mixture model results} + +\item{genes.use}{Genes to use in the greedy k-means step (See manuscript for details)} + +\item{start.pct}{Initial estimates of the percentage of cells in the 'on' +state (usually estimated from the in situ map)} +} +\value{ +A Seurat object, where the posterior of each cell being in the 'on' +or 'off' state for each gene is stored in object@spatial@mix.probs +} +\description{ +Models the imputed gene expression values as a mixture of gaussian +distributions. For a two-state model, estimates the probability that a given +cell is in the 'on' or 'off' state for any gene. 
Followed by a greedy +k-means step where cells are allowed to flip states based on the overall +structure of the data (see Manuscript for details) +} diff --git a/man/GenePlot.Rd b/man/GenePlot.Rd new file mode 100644 index 000000000..8952a180c --- /dev/null +++ b/man/GenePlot.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{GenePlot} +\alias{GenePlot} +\title{Scatter plot of single cell data} +\usage{ +GenePlot(object, gene1, gene2, cell.ids = NULL, col.use = NULL, + pch.use = 16, cex.use = 1.5, use.imputed = FALSE, use.scaled = FALSE, + use.raw = FALSE, do.hover = FALSE, data.hover = NULL, + do.identify = FALSE, dark.theme = FALSE, do.spline = FALSE, + spline.span = 0.75, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{gene1}{First feature to plot. Typically gene expression but can also +be metrics, PC scores, etc. - anything that can be retreived with FetchData} + +\item{gene2}{Second feature to plot.} + +\item{cell.ids}{Cells to include on the scatter plot.} + +\item{col.use}{Colors to use for identity class plotting.} + +\item{pch.use}{Pch argument for plotting} + +\item{cex.use}{Cex argument for plotting} + +\item{use.imputed}{Use imputed values for gene expression (Default is FALSE)} + +\item{use.scaled}{Use scaled data} + +\item{use.raw}{Use raw data} + +\item{do.hover}{Enable hovering over points to view information} + +\item{data.hover}{Data to add to the hover, pass a character vector of features to add. 
Defaults to cell name} + +\item{do.identify}{Opens a locator session to identify clusters of cells.} + +\item{dark.theme}{Use a dark theme for the plot} + +\item{do.spline}{Add a spline (currently hardwired to df=4, to be improved)} + +\item{spline.span}{spline span in loess function call} + +\item{\dots}{Additional arguments to be passed to plot.} +} +\value{ +No return, only graphical output +} +\description{ +Creates a scatter plot of two features (typically gene expression), across a +set of single cells. Cells are colored by their identity class. +} diff --git a/man/GenesInCluster.Rd b/man/GenesInCluster.Rd new file mode 100644 index 000000000..7ed7c2572 --- /dev/null +++ b/man/GenesInCluster.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{GenesInCluster} +\alias{GenesInCluster} +\title{GenesInCluster} +\usage{ +GenesInCluster(object, cluster.num, max.genes = 1e+06) +} +\arguments{ +\item{object}{Seurat object. Assumes DoKMeans has already been run} + +\item{cluster.num}{K-means cluster(s) to return genes for} + +\item{max.genes}{max number of genes to return} +} +\value{ +A vector of genes who are members in the cluster.num k-means cluster(s) +} +\description{ +After k-means analysis, previously run with DoKMeans, returns a set of genes associated with each cluster +} diff --git a/man/GetAssayData.Rd b/man/GetAssayData.Rd new file mode 100644 index 000000000..2657cac03 --- /dev/null +++ b/man/GetAssayData.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multi_modal.R +\name{GetAssayData} +\alias{GetAssayData} +\title{Accessor function for multimodal data} +\usage{ +GetAssayData(object, assay.type = "RNA", slot = "data") +} +\arguments{ +\item{object}{Seurat object} + +\item{assay.type}{Type of assay to fetch data for (default is RNA)} + +\item{slot}{Specific information to pull (i.e. raw.data, data, scale.data,...). 
Default is data} +} +\value{ +Returns assay data +} +\description{ +Pull information for specified stored dimensional reduction analysis +} diff --git a/man/GetCellEmbeddings.Rd b/man/GetCellEmbeddings.Rd new file mode 100644 index 000000000..22fb636dc --- /dev/null +++ b/man/GetCellEmbeddings.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{GetCellEmbeddings} +\alias{GetCellEmbeddings} +\title{Dimensional Reduction Cell Embeddings Accessor Function} +\usage{ +GetCellEmbeddings(object, reduction.type = "pca", dims.use = NULL, + cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Type of dimensional reduction to fetch (default is PCA)} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{cells.use}{Cells to include (default is all cells)} +} +\value{ +Cell embedding matrix for given reduction, cells, and dimensions +} +\description{ +Pull cell embeddings matrix for specified stored dimensional reduction +analysis +} diff --git a/man/GetCentroids.Rd b/man/GetCentroids.Rd new file mode 100644 index 000000000..35bf2d798 --- /dev/null +++ b/man/GetCentroids.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial.R +\name{GetCentroids} +\alias{GetCentroids} +\title{Get cell centroids} +\usage{ +GetCentroids(object, cells.use = NULL, get.exact = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{Cells to calculate centroids for (default is all cells)} + +\item{get.exact}{Get exact centroid (Default is TRUE). If FALSE, identify +the single closest bin.} +} +\value{ +Data frame containing the x and y coordinates for each cell +centroid. +} +\description{ +Calculate the spatial mapping centroids for each cell, based on previously +calculated mapping probabilities for each bin. 
+} +\details{ +Currently, Seurat assumes that the tissue of interest has an 8x8 bin +structure. This will be broadened in a future release. +} diff --git a/man/GetClusters.Rd b/man/GetClusters.Rd new file mode 100644 index 000000000..0b0e20ee3 --- /dev/null +++ b/man/GetClusters.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{GetClusters} +\alias{GetClusters} +\title{Get Cluster Assignments} +\usage{ +GetClusters(object) +} +\arguments{ +\item{object}{Seurat object with cluster assignments} +} +\value{ +Returns a dataframe with cell names and cluster assignments +} +\description{ +Retrieve cluster IDs as a dataframe. First column will be the cell name, +second column will be the current cluster identity (pulled from object@ident). +} diff --git a/man/GetDimReduction.Rd b/man/GetDimReduction.Rd new file mode 100644 index 000000000..94c42ef34 --- /dev/null +++ b/man/GetDimReduction.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{GetDimReduction} +\alias{GetDimReduction} +\title{Dimensional Reduction Accessor Function} +\usage{ +GetDimReduction(object, reduction.type = "pca", slot = "gene.loadings") +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Type of dimensional reduction to fetch (default is PCA)} + +\item{slot}{Specific information to pull (must be one of the following: +"cell.embeddings", "gene.loadings", "gene.loadings.full", "sdev", "key", "misc")} +} +\value{ +Returns specified slot results from given reduction technique +} +\description{ +General accessor function for dimensional reduction objects. Pulls slot +contents for specified stored dimensional reduction analysis. 
+} diff --git a/man/GetGeneLoadings.Rd b/man/GetGeneLoadings.Rd new file mode 100644 index 000000000..663038e87 --- /dev/null +++ b/man/GetGeneLoadings.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{GetGeneLoadings} +\alias{GetGeneLoadings} +\title{Dimensional Reduction Gene Loadings Accessor Function} +\usage{ +GetGeneLoadings(object, reduction.type = "pca", dims.use = NULL, + genes.use = NULL, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Type of dimensional reduction to fetch (default is PCA)} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{genes.use}{Genes to include (default is all genes)} + +\item{use.full}{Return projected gene loadings (default is FALSE)} +} +\value{ +Gene loading matrix for given reduction, cells, and genes +} +\description{ +Pull gene loadings matrix for specified stored dimensional reduction analysis. +} diff --git a/man/HoverLocator.Rd b/man/HoverLocator.Rd new file mode 100644 index 000000000..f2ccdc522 --- /dev/null +++ b/man/HoverLocator.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting_utilities.R +\name{HoverLocator} +\alias{HoverLocator} +\title{Hover Locator} +\usage{ +HoverLocator(plot, data.plot, features.info = NULL, dark.theme = FALSE, ...) 
+} +\arguments{ +\item{plot}{A ggplot2 plot} + +\item{data.plot}{The oridinal data that went into the ggplot2 plot} + +\item{features.info}{An optional dataframe or matrix of extra information to be displayed on hover} + +\item{dark.theme}{Plot using a dark theme?} + +\item{...}{Extra parameters to be passed to plotly::layout} +} +\description{ +Get quick information from a scatterplot by hovering over points +} +\seealso{ +\code{\link{plotly::layout}} + +\code{\link{ggplot2::ggplot_build}} +} diff --git a/man/ICAEmbed.Rd b/man/ICAEmbed.Rd new file mode 100644 index 000000000..aab6e3132 --- /dev/null +++ b/man/ICAEmbed.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{ICAEmbed} +\alias{ICAEmbed} +\title{ICA Cell Embeddings Accessor Function} +\usage{ +ICAEmbed(object, dims.use = NULL, cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{cells.use}{Cells to include (default is all cells)} +} +\value{ +ICA cell embeddings matrix for given cells and ICs +} +\description{ +Pull ICA cell embeddings matrix +} diff --git a/man/ICALoad.Rd b/man/ICALoad.Rd new file mode 100644 index 000000000..35df036d2 --- /dev/null +++ b/man/ICALoad.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{ICALoad} +\alias{ICALoad} +\title{ICA Gene Loadings Accessor Function} +\usage{ +ICALoad(object, dims.use = NULL, genes.use = NULL, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{genes.use}{Genes to include (default is all)} +} +\value{ +ICA gene loading matrix for given genes and ICs +} +\description{ +Pull the ICA gene loadings matrix +} diff --git a/man/ICAPlot.Rd b/man/ICAPlot.Rd new file mode 100644 index 000000000..74c59110a 
--- /dev/null +++ b/man/ICAPlot.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{ICAPlot} +\alias{ICAPlot} +\title{Plot ICA map} +\usage{ +ICAPlot(object, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{\dots}{Additional parameters to DimPlot, for example, which dimensions to plot.} +} +\description{ +Graphs the output of a ICA analysis +Cells are colored by their identity class. +} +\details{ +This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +arguments which can be passed in here. +} diff --git a/man/ICHeatmap.Rd b/man/ICHeatmap.Rd new file mode 100644 index 000000000..1ae2eebc7 --- /dev/null +++ b/man/ICHeatmap.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{ICHeatmap} +\alias{ICHeatmap} +\title{Independent component heatmap} +\usage{ +ICHeatmap(object, ic.use = 1, cells.use = NULL, num.genes = 30, + disp.min = -2.5, disp.max = 2.5, do.return = FALSE, col.use = pyCols, + use.scale = TRUE, do.balanced = FALSE, remove.key = FALSE, + label.columns = NULL, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{ic.use}{Independent components to use} + +\item{cells.use}{Cells to include in the heatmap (default is all cells)} + +\item{num.genes}{Number of genes to return} + +\item{disp.min}{Minimum display value (all values below are clipped)} + +\item{disp.max}{Maximum display value (all values above are clipped)} + +\item{use.scale}{Default is TRUE: plot scaled data. If FALSE, plot raw data on the heatmap.} + +\item{do.balanced}{Return an equal number of genes with both + and - IC scores.} + +\item{remove.key}{Removes the color key from the plot.} +} +\value{ +If do.return==TRUE, a matrix of scaled values which would be passed +to heatmap.2. Otherwise, no return value, only a graphical output +} +\description{ +Draws a heatmap focusing on a principal component. 
Both cells and genes are sorted by their +principal component scores. Allows for nice visualization of sources of heterogeneity +in the dataset. +} diff --git a/man/ICTopCells.Rd b/man/ICTopCells.Rd new file mode 100644 index 000000000..790f92a24 --- /dev/null +++ b/man/ICTopCells.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{ICTopCells} +\alias{ICTopCells} +\title{Find cells with highest ICA scores} +\usage{ +ICTopCells(object, ic.use = 1, num.cells = NULL, do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{ic.use}{Independent component to use} + +\item{num.cells}{Number of cells to return} + +\item{do.balanced}{Return an equal number of cells with both + and - IC scores.} +} +\value{ +Returns a vector of cells +} +\description{ +Return a list of cells with the strongest contribution to a set of independent +components +} diff --git a/man/ICTopGenes.Rd b/man/ICTopGenes.Rd new file mode 100644 index 000000000..9e9631b8a --- /dev/null +++ b/man/ICTopGenes.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{ICTopGenes} +\alias{ICTopGenes} +\title{Find genes with highest ICA scores} +\usage{ +ICTopGenes(object, ic.use = 1, num.genes = 30, use.full = FALSE, + do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{ic.use}{Independent components to use} + +\item{num.genes}{Number of genes to return} + +\item{do.balanced}{Return an equal number of genes with both + and - IC scores.} +} +\value{ +Returns a vector of genes +} +\description{ +Return a list of genes with the strongest contribution to a set of +independent components +} diff --git a/man/InitialMapping.Rd b/man/InitialMapping.Rd new file mode 100644 index 000000000..cf3db9c2a --- /dev/null +++ b/man/InitialMapping.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% 
Please edit documentation in R/spatial.R +\name{InitialMapping} +\alias{InitialMapping} +\title{Infer spatial origins for single cells} +\usage{ +InitialMapping(object, cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{Which cells to map} +} +\value{ +Seurat object, where mapping probabilities for each bin are stored +in object@final.prob +} +\description{ +Probabilistically maps single cells based on (imputed) gene expression +estimates, a set of mixture models, and an in situ spatial reference map. +} diff --git a/man/JackStraw.Rd b/man/JackStraw.Rd new file mode 100644 index 000000000..4fe7e2604 --- /dev/null +++ b/man/JackStraw.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/jackstraw.R +\name{JackStraw} +\alias{JackStraw} +\title{Determine statistical significance of PCA scores.} +\usage{ +JackStraw(object, num.pc = 20, num.replicate = 100, prop.freq = 0.01, + do.print = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{num.pc}{Number of PCs to compute significance for} + +\item{num.replicate}{Number of replicate samplings to perform} + +\item{prop.freq}{Proportion of the data to randomly permute for each +replicate} + +\item{do.print}{Print the number of replicates that have been processed.} +} +\value{ +Returns a Seurat object where object@jackStraw.empP represents +p-values for each gene in the PCA analysis. If ProjectPCA is subsequently +run, object@jackStraw.empP.full then represents p-values for all genes. +} +\description{ +Randomly permutes a subset of data, and calculates projected PCA scores for +these 'random' genes. Then compares the PCA scores for the 'random' genes +with the observed PCA scores to determine statistical signifance. End result +is a p-value for each gene's association with each principal component. 
+} +\references{ +Inspired by Chung et al, Bioinformatics (2014) +} diff --git a/man/JackStrawPlot.Rd b/man/JackStrawPlot.Rd new file mode 100644 index 000000000..317408a6f --- /dev/null +++ b/man/JackStrawPlot.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{JackStrawPlot} +\alias{JackStrawPlot} +\title{JackStraw Plot} +\usage{ +JackStrawPlot(object, PCs = 1:5, nCol = 3, score.thresh = 1e-05, + plot.x.lim = 0.1, plot.y.lim = 0.3) +} +\arguments{ +\item{object}{Seurat plot} + +\item{PCs}{Which PCs to examine} + +\item{nCol}{Number of columns} + +\item{score.thresh}{Threshold to use for the proportion test of PC +significance (see Details)} + +\item{plot.x.lim}{X-axis maximum on each QQ plot.} + +\item{plot.y.lim}{Y-axis maximum on each QQ plot.} +} +\value{ +A ggplot object +} +\description{ +Plots the results of the JackStraw analysis for PCA significance. For each +PC, plots a QQ-plot comparing the distribution of p-values for all genes +across each PC, compared with a uniform distribution. Also determines a +p-value for the overall significance of each PC (see Details). +} +\details{ +Significant PCs should show a p-value distribution (black curve) that is +strongly skewed to the left compared to the null distribution (dashed line) +The p-value for each PC is based on a proportion test comparing the number +of genes with a p-value below a particular threshold (score.thresh), compared with the +proportion of genes expected under a uniform distribution of p-values. 
+} +\author{ +Thanks to Omri Wurtzel for integrating with ggplot +} diff --git a/man/KClustDimension.Rd b/man/KClustDimension.Rd new file mode 100644 index 000000000..5f6d9bbef --- /dev/null +++ b/man/KClustDimension.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{KClustDimension} +\alias{KClustDimension} +\title{Perform spectral k-means clustering on single cells} +\usage{ +KClustDimension(object, dims.use = c(1, 2), cells.use = NULL, pt.size = 4, + reduction.use = "tsne", k.use = 5, set.ident = T, seed.use = 1, ...) +} +\description{ +Find point clouds of single cells in a low-dimensional space using k-means clustering. +Can be useful for smaller datasets, where graph-based clustering can perform poorly +TODO : add documentation here +} diff --git a/man/KMeansHeatmap.Rd b/man/KMeansHeatmap.Rd new file mode 100644 index 000000000..80639c01a --- /dev/null +++ b/man/KMeansHeatmap.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{KMeansHeatmap} +\alias{KMeansHeatmap} +\title{Plot k-means clusters} +\usage{ +KMeansHeatmap(object, cells.use = object@cell.names, genes.cluster = NULL, + max.genes = 1e+06, slim.col.label = TRUE, remove.key = TRUE, + row.lines = TRUE, ...) 
+} +\arguments{ +\item{object}{A Seurat object} + +\item{cells.use}{Cells to include in the heatmap} + +\item{genes.cluster}{Clusters to include in heatmap} + +\item{max.genes}{Maximum number of genes to include in the heatmap} + +\item{remove.key}{Removes the color key from the plot} + +\item{row.lines}{Color separations of clusters} + +\item{...}{Extra parameters to DoHeatmap} + +\item{slim.col.label}{Instead of displaying every cell name on the heatmap, +display only the identity class name once for each group} +} +\description{ +Plot k-means clusters +} +\seealso{ +\code{\link{DoHeatmap}} +} diff --git a/man/LogNormalize.Rd b/man/LogNormalize.Rd new file mode 100644 index 000000000..345e8c975 --- /dev/null +++ b/man/LogNormalize.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{LogNormalize} +\alias{LogNormalize} +\title{Normalize raw data} +\usage{ +LogNormalize(data, scale.factor = 10000, display.progress = TRUE) +} +\arguments{ +\item{data}{Matrix with the raw count data} + +\item{scale.factor}{Scale the data. 
Default is 1e4} + +\item{display.progress}{Print progress} +} +\value{ +Returns a matrix with the normalize and log transformed data +} +\description{ +Normalize count data per cell and transform to log scale +} diff --git a/man/LogVMR.Rd b/man/LogVMR.Rd new file mode 100644 index 000000000..d9f6aaf4b --- /dev/null +++ b/man/LogVMR.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{LogVMR} +\alias{LogVMR} +\title{Calculate the variance to mean ratio of logged values} +\usage{ +LogVMR(x) +} +\arguments{ +\item{x}{value or vector of values} +} +\value{ +Returns the VMR in log-space +} +\description{ +Calculate the variance to mean ratio (VMR) in non-logspace (return answer in +log-space) +} diff --git a/man/MakeSparse.Rd b/man/MakeSparse.Rd new file mode 100644 index 000000000..d2cec6870 --- /dev/null +++ b/man/MakeSparse.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{MakeSparse} +\alias{MakeSparse} +\title{Make object sparse} +\usage{ +MakeSparse(object) +} +\arguments{ +\item{object}{Seurat object} +} +\value{ +Returns a seurat object with data converted to sparse matrices. +} +\description{ +Converts stored data matrices to sparse matrices to save space. Converts +object@raw.data and object@data to sparse matrices. +} diff --git a/man/MarkerTest.Rd b/man/MarkerTest.Rd new file mode 100644 index 000000000..38887e90e --- /dev/null +++ b/man/MarkerTest.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{MarkerTest} +\alias{MarkerTest} +\title{ROC-based marker discovery} +\usage{ +MarkerTest(object, cells.1, cells.2, genes.use = NULL, print.bar = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. 
Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} +} +\value{ +Returns a 'predictive power' (abs(AUC-0.5)) ranked matrix of +putative differentially expressed genes. +} +\description{ +Identifies 'markers' of gene expression using ROC analysis. For each gene, +evaluates (using AUC) a classifier built on that gene alone, to classify +between two groups of cells. +} +\details{ +An AUC value of 1 means that expression values for this gene alone can +perfectly classify the two groupings (i.e. Each of the cells in cells.1 +exhibit a higher level than each of the cells in cells.2). An AUC value of 0 +also means there is perfect classification, but in the other direction. A +value of 0.5 implies that the gene has no predictive power to classify the +two groups. +} diff --git a/man/MatrixRowShuffle.Rd b/man/MatrixRowShuffle.Rd new file mode 100644 index 000000000..e9c238a66 --- /dev/null +++ b/man/MatrixRowShuffle.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{MatrixRowShuffle} +\alias{MatrixRowShuffle} +\title{Independently shuffle values within each row of a matrix} +\usage{ +MatrixRowShuffle(x) +} +\arguments{ +\item{x}{Matrix to shuffle} +} +\value{ +Returns a scrambled matrix, where each row is shuffled independently +} +\description{ +Creates a matrix where correlation structure has been removed, but overall values are the same +} diff --git a/man/MergeNode.Rd b/man/MergeNode.Rd new file mode 100644 index 000000000..0ecbcdb99 --- /dev/null +++ b/man/MergeNode.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{MergeNode} +\alias{MergeNode} +\title{Merge children of a node} +\usage{ +MergeNode(object, node.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{node.use}{Merge children of this node} +} +\description{ +Merge the children of a node 
into a single identity class +} diff --git a/man/MergeSeurat.Rd b/man/MergeSeurat.Rd new file mode 100644 index 000000000..a35bbaa61 --- /dev/null +++ b/man/MergeSeurat.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{MergeSeurat} +\alias{MergeSeurat} +\title{Merge Seurat Objects} +\usage{ +MergeSeurat(object1, object2, project = NULL, min.cells = 0, + min.genes = 0, is.expr = 0, do.normalize = TRUE, scale.factor = 10000, + do.scale = FALSE, do.center = FALSE, names.field = 1, + names.delim = "_", save.raw = TRUE, add.cell.id1 = NULL, + add.cell.id2 = NULL) +} +\arguments{ +\item{object1}{First Seurat object to merge} + +\item{object2}{Second Seurat object to merge} + +\item{min.cells}{Include genes with detected expression in at least this +many cells} + +\item{min.genes}{Include cells where at least this many genes are detected} + +\item{is.expr}{Expression threshold for 'detected' gene} + +\item{do.scale}{In object@scale.data, perform row-scaling (gene-based +z-score). FALSE by default, so run ScaleData after merging.} + +\item{do.center}{In object@scale.data, perform row-centering (gene-based +centering). FALSE by default} + +\item{names.field}{For the initial identity class for each cell, choose this +field from the cell's column name} + +\item{names.delim}{For the initial identity class for each cell, choose this +delimiter from the cell's column name} + +\item{save.raw}{TRUE by default. If FALSE, do not save the unmodified data in object@raw.data +which will save memory downstream for large datasets} + +\item{add.cell.id1}{String to be appended to the names of all cells in object1} + +\item{add.cell.id2}{String to be appended to the names of all cells in object2} + +\item{normalization.method}{Normalize the data after merging. Default is TRUE. 
+If set, will perform the same normalization strategy as stored for the first object} + +\item{meta.data}{Additional metadata to add to the Seurat object. Should be +a data frame where the rows are cell names, and the columns are additional +metadata fields} +} +\value{ +Merged Seurat object +} +\description{ +Merge two Seurat objects +} diff --git a/man/MinMax.Rd b/man/MinMax.Rd new file mode 100644 index 000000000..bcd784596 --- /dev/null +++ b/man/MinMax.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{MinMax} +\alias{MinMax} +\title{Apply a ceiling and floor to all values in a matrix} +\usage{ +MinMax(data, min, max) +} +\arguments{ +\item{data}{Matrix or data frame} + +\item{min}{all values below this min value will be replaced with min} + +\item{max}{all values above this max value will be replaced with max} +} +\value{ +Returns matrix after performing these floor and ceil operations +} +\description{ +Apply a ceiling and floor to all values in a matrix +} diff --git a/man/NegBinomDETest.Rd b/man/NegBinomDETest.Rd new file mode 100644 index 000000000..96f61c0d0 --- /dev/null +++ b/man/NegBinomDETest.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{NegBinomDETest} +\alias{NegBinomDETest} +\title{Negative binomial test for UMI-count based data} +\usage{ +NegBinomDETest(object, cells.1, cells.2, genes.use = NULL, + latent.vars = NULL, print.bar = TRUE, min.cells = 3) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. 
Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} + +\item{min.cells}{Minimum number of cells expressing the gene in at least one of the two groups} +} +\value{ +Returns a p-value ranked matrix of putative differentially expressed +genes. +} +\description{ +Identifies differentially expressed genes between two groups of cells using +a negative binomial generalized linear model +} diff --git a/man/NegBinomRegDETest.Rd b/man/NegBinomRegDETest.Rd new file mode 100644 index 000000000..968d15656 --- /dev/null +++ b/man/NegBinomRegDETest.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{NegBinomRegDETest} +\alias{NegBinomRegDETest} +\title{Negative binomial test for UMI-count based data (regularized version)} +\usage{ +NegBinomRegDETest(object, cells.1, cells.2, genes.use = NULL, + latent.vars = NULL, print.bar = TRUE, min.cells = 3) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} + +\item{min.cells}{Minimum number of cells expressing the gene in at least one of the two groups} +} +\value{ +Returns a p-value ranked data frame of test results. +} +\description{ +Identifies differentially expressed genes between two groups of cells using +a likelihood ratio test of negative binomial generalized linear models where +the overdispersion parameter theta is determined by pooling information +across genes. 
+} diff --git a/man/NodeHeatmap.Rd b/man/NodeHeatmap.Rd new file mode 100644 index 000000000..afbf3e9b7 --- /dev/null +++ b/man/NodeHeatmap.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{NodeHeatmap} +\alias{NodeHeatmap} +\title{Node Heatmap} +\usage{ +NodeHeatmap(object, marker.list, node = NULL, max.genes = 10, ...) +} +\arguments{ +\item{object}{Seurat object. Must have the cluster.tree slot filled (use BuildClusterTree)} + +\item{marker.list}{List of marker genes given from the FindAllMarkersNode function} + +\item{node}{Node in the cluster tree from which to start the plot, defaults to highest node in marker list} + +\item{max.genes}{Maximum number of genes to keep for each division} + +\item{...}{Additional parameters to pass to DoHeatmap} +} +\value{ +Plots heatmap. No return value. +} +\description{ +Takes an object, a marker list (output of FindAllMarkers), and a node +and plots a heatmap where genes are ordered vertically by the splits present +in the object@cluster.tree slot. +} diff --git a/man/NormalizeData.Rd b/man/NormalizeData.Rd new file mode 100644 index 000000000..483b24833 --- /dev/null +++ b/man/NormalizeData.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{NormalizeData} +\alias{NormalizeData} +\title{Normalize Assay Data} +\usage{ +NormalizeData(object, assay.type = "RNA", + normalization.method = "LogNormalize", scale.factor = 10000, + display.progress = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{assay.type}{Type of assay to normalize for (default is RNA), but can be +changed for multimodal analyses.} + +\item{normalization.method}{Method for normalization. Default is +log-normalization (LogNormalize). 
More methods to be added very shortly.} + +\item{scale.factor}{Sets the scale factor for cell-level normalization} + +\item{display.progress}{display progress bar for scaling procedure.} +} +\value{ +Returns object after normalization. Normalized data is stored in data +or scale.data slot, depending on the method +} +\description{ +Normalize data for a given assay +} diff --git a/man/NumberClusters.Rd b/man/NumberClusters.Rd new file mode 100644 index 000000000..76d741cca --- /dev/null +++ b/man/NumberClusters.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{NumberClusters} +\alias{NumberClusters} +\title{Convert the cluster labels to a numeric representation} +\usage{ +NumberClusters(object) +} +\arguments{ +\item{object}{Seurat object} +} +\value{ +Returns a Seurat object with the identities relabeled numerically +starting from 1. +} +\description{ +Convert the cluster labels to a numeric representation +} diff --git a/man/OldDoHeatmap.Rd b/man/OldDoHeatmap.Rd new file mode 100644 index 000000000..48d7827eb --- /dev/null +++ b/man/OldDoHeatmap.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{OldDoHeatmap} +\alias{OldDoHeatmap} +\title{Gene expression heatmap} +\usage{ +OldDoHeatmap(object, cells.use = NULL, genes.use = NULL, disp.min = NULL, + disp.max = NULL, draw.line = TRUE, do.return = FALSE, + order.by.ident = TRUE, col.use = pyCols, slim.col.label = FALSE, + group.by = NULL, remove.key = FALSE, cex.col = NULL, do.scale = TRUE, + ...) 
+} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{Cells to include in the heatmap (default is all cells)} + +\item{genes.use}{Genes to include in the heatmap (ordered)} + +\item{disp.min}{Minimum display value (all values below are clipped)} + +\item{disp.max}{Maximum display value (all values above are clipped)} + +\item{draw.line}{Draw vertical lines delineating cells in different identity +classes.} + +\item{do.return}{Default is FALSE. If TRUE, return a matrix of scaled values +which would be passed to heatmap.2} + +\item{order.by.ident}{Order cells in the heatmap by identity class (default +is TRUE). If FALSE, cells are ordered based on their order in cells.use} + +\item{col.use}{Color palette to use} + +\item{slim.col.label}{if (order.by.ident==TRUE) then instead of displaying +every cell name on the heatmap, display only the identity class name once +for each group} + +\item{group.by}{If (order.by.ident==TRUE) default, you can group cells in +different ways (for example, orig.ident)} + +\item{remove.key}{Removes the color key from the plot.} + +\item{cex.col}{positive numbers, used as cex.axis in for the column axis labeling. +The defaults currently only use number of columns} + +\item{do.scale}{whether to use the data or scaled data} + +\item{...}{Additional parameters to heatmap.2. Common examples are cexRow +and cexCol, which set row and column text sizes} +} +\value{ +If do.return==TRUE, a matrix of scaled values which would be passed +to heatmap.2. Otherwise, no return value, only a graphical output +} +\description{ +Draws a heatmap of single cell gene expression using the heatmap.2 function. 
Has been replaced by the ggplot2 +version (now in DoHeatmap), but kept for legacy +} diff --git a/man/PCAEmbed.Rd b/man/PCAEmbed.Rd new file mode 100644 index 000000000..4548b2b86 --- /dev/null +++ b/man/PCAEmbed.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PCAEmbed} +\alias{PCAEmbed} +\title{PCA Cell Embeddings Accessor Function} +\usage{ +PCAEmbed(object, dims.use = NULL, cells.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{cells.use}{Cells to include (default is all cells)} +} +\value{ +PCA cell embedding matrix for given cells and PCs +} +\description{ +Pull PCA cell embedding matrix +} diff --git a/man/PCALoad.Rd b/man/PCALoad.Rd new file mode 100644 index 000000000..01d671904 --- /dev/null +++ b/man/PCALoad.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PCALoad} +\alias{PCALoad} +\title{PCA Gene Loadings Accessor Function} +\usage{ +PCALoad(object, dims.use = NULL, genes.use = NULL, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.use}{Dimensions to include (default is all stored dims)} + +\item{genes.use}{Genes to include (default is all genes)} +} +\value{ +PCA gene loading matrix for given genes and PCs +} +\description{ +Pull the PCA gene loadings matrix +} diff --git a/man/PCAPlot.Rd b/man/PCAPlot.Rd new file mode 100644 index 000000000..c08e67b4e --- /dev/null +++ b/man/PCAPlot.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{PCAPlot} +\alias{PCAPlot} +\title{Plot PCA map} +\usage{ +PCAPlot(object, ...) 
+} +\arguments{ +\item{object}{Seurat object} + +\item{\dots}{Additional parameters to DimPlot, for example, which dimensions to plot.} +} +\description{ +Graphs the output of a PCA analysis +Cells are colored by their identity class. +} +\details{ +This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +arguments which can be passed in here. +} diff --git a/man/PCASigGenes.Rd b/man/PCASigGenes.Rd new file mode 100644 index 000000000..7318b8868 --- /dev/null +++ b/man/PCASigGenes.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/jackstraw.R +\name{PCASigGenes} +\alias{PCASigGenes} +\title{Significant genes from a PCA} +\usage{ +PCASigGenes(object, pcs.use, pval.cut = 0.1, use.full = FALSE, + max.per.pc = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{pcs.use}{PCS to use.} + +\item{pval.cut}{P-value cutoff} + +\item{use.full}{Use the full list of genes (from the projected PCA). Assumes +that ProjectPCA has been run. Currently, must be set to FALSE.} + +\item{max.per.pc}{Maximum number of genes to return per PC. Used to avoid genes from one PC dominating the entire analysis.} +} +\value{ +A vector of genes whose p-values are statistically significant for +at least one of the given PCs. +} +\description{ +Returns a set of genes, based on the JackStraw analysis, that have +statistically significant associations with a set of PCs. 
+} diff --git a/man/PCElbowPlot.Rd b/man/PCElbowPlot.Rd new file mode 100644 index 000000000..da0610f0e --- /dev/null +++ b/man/PCElbowPlot.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{PCElbowPlot} +\alias{PCElbowPlot} +\title{Quickly Pick Relevant PCs} +\usage{ +PCElbowPlot(object, num.pc = 20) +} +\arguments{ +\item{object}{Seurat object} + +\item{num.pc}{Number of PCs to plot} +} +\value{ +Returns ggplot object +} +\description{ +Plots the standard deviations (or approximate singular values if running PCAFast) +of the principal components for easy identification of an elbow in the graph. +This elbow often corresponds well with the significant PCs and is much faster to run. +} diff --git a/man/PCHeatmap.Rd b/man/PCHeatmap.Rd new file mode 100644 index 000000000..a44d744e4 --- /dev/null +++ b/man/PCHeatmap.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{PCHeatmap} +\alias{PCHeatmap} +\title{Principal component heatmap} +\usage{ +PCHeatmap(object, pc.use = 1, cells.use = NULL, num.genes = 30, + use.full = FALSE, disp.min = -2.5, disp.max = 2.5, do.return = FALSE, + col.use = pyCols, use.scale = TRUE, do.balanced = FALSE, + remove.key = FALSE, label.columns = NULL, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{pc.use}{Principal components to use} + +\item{cells.use}{A list of cells to plot. If numeric, just plots the top cells.} + +\item{num.genes}{Number of genes to return} + +\item{use.full}{Use the full PCA (projected PCA). Default is FALSE} + +\item{disp.min}{Minimum display value (all values below are clipped)} + +\item{disp.max}{Maximum display value (all values above are clipped)} + +\item{use.scale}{Default is TRUE: plot scaled data. 
If FALSE, plot raw data on the heatmap.} + +\item{do.balanced}{Return an equal number of genes with both + and - PC scores.} + +\item{remove.key}{Removes the color key from the plot.} + +\item{label.columns}{Whether to label the columns. Default is TRUE for 1 PC, FALSE for > 1 PC} +} +\value{ +If do.return==TRUE, a matrix of scaled values which would be passed +to heatmap.2. Otherwise, no return value, only a graphical output +} +\description{ +Draws a heatmap focusing on a principal component. Both cells and genes are sorted by their principal component scores. +Allows for nice visualization of sources of heterogeneity in the dataset. +} diff --git a/man/PCTopCells.Rd b/man/PCTopCells.Rd new file mode 100644 index 000000000..3b6567653 --- /dev/null +++ b/man/PCTopCells.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PCTopCells} +\alias{PCTopCells} +\title{Find cells with highest PCA scores} +\usage{ +PCTopCells(object, pc.use = 1, num.cells = NULL, do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{pc.use}{Principal component to use} + +\item{num.cells}{Number of cells to return} + +\item{do.balanced}{Return an equal number of cells with both + and - PC scores.} +} +\value{ +Returns a vector of cells +} +\description{ +Return a list of genes with the strongest contribution to a set of principal components +} diff --git a/man/PCTopGenes.Rd b/man/PCTopGenes.Rd new file mode 100644 index 000000000..79377c59a --- /dev/null +++ b/man/PCTopGenes.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PCTopGenes} +\alias{PCTopGenes} +\title{Find genes with highest PCA scores} +\usage{ +PCTopGenes(object, pc.use = 1, num.genes = 30, use.full = FALSE, + do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{pc.use}{Principal components to use} + 
+\item{num.genes}{Number of genes to return} + +\item{use.full}{Use the full PCA (projected PCA). Default is FALSE} + +\item{do.balanced}{Return an equal number of genes with both + and - PC scores.} +} +\value{ +Returns a vector of genes +} +\description{ +Return a list of genes with the strongest contribution to a set of principal +components +} diff --git a/man/PlotClusterTree.Rd b/man/PlotClusterTree.Rd new file mode 100644 index 000000000..16c8362f7 --- /dev/null +++ b/man/PlotClusterTree.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{PlotClusterTree} +\alias{PlotClusterTree} +\title{Plot phylogenetic tree} +\usage{ +PlotClusterTree(object, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{\dots}{Additional arguments for plotting the phylogeny} +} +\value{ +Plots dendrogram (must be precomputed using BuildClusterTree), returns no value +} +\description{ +Plots previously computed phylogenetic tree (from BuildClusterTree) +} diff --git a/man/PoissonDETest.Rd b/man/PoissonDETest.Rd new file mode 100644 index 000000000..947ef65e8 --- /dev/null +++ b/man/PoissonDETest.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{PoissonDETest} +\alias{PoissonDETest} +\title{Poisson test for UMI-count based data} +\usage{ +PoissonDETest(object, cells.1, cells.2, genes.use = NULL, + latent.vars = NULL, print.bar = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} +} +\value{ +Returns a p-value ranked matrix of putative differentially expressed +genes. 
+} +\description{ +Identifies differentially expressed genes between two groups of cells using +a poisson generalized linear model +} diff --git a/man/PrintAlignSubspaceParams.Rd b/man/PrintAlignSubspaceParams.Rd new file mode 100644 index 000000000..e2ccaa70c --- /dev/null +++ b/man/PrintAlignSubspaceParams.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintAlignSubspaceParams} +\alias{PrintAlignSubspaceParams} +\title{Print AlignSubspace Calculation Parameters} +\usage{ +PrintAlignSubspaceParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for the AlignSubspace calculation. Default (FALSE) will print a +nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest AlignSubspace calculation for each +stored aligned subspace. +} diff --git a/man/PrintCCAParams.Rd b/man/PrintCCAParams.Rd new file mode 100644 index 000000000..8c77aa599 --- /dev/null +++ b/man/PrintCCAParams.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintCCAParams} +\alias{PrintCCAParams} +\title{Print CCA Calculation Parameters} +\usage{ +PrintCCAParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for the RunCCA calculation. Default (FALSE) will print a nicely +formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored CCA calculation. 
+} diff --git a/man/PrintCalcParams.Rd b/man/PrintCalcParams.Rd new file mode 100644 index 000000000..6362bebc4 --- /dev/null +++ b/man/PrintCalcParams.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintCalcParams} +\alias{PrintCalcParams} +\title{Print the calculation} +\usage{ +PrintCalcParams(object, calculation, raw = FALSE, return.list = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{calculation}{Name of calculation (function name) to check parameters +for} + +\item{raw}{Print the entire contents of the calculation settings slot (calc.params) +for the RunPCA calculation.} + +\item{return.list}{Return the calculation parameters as a list} +} +\value{ +Prints the calculation settings and optionally returns them as a list +} +\description{ +Print entire contents of calculation settings slot (calc.params) for given +calculation. +} diff --git a/man/PrintCalcVarExpRatioParams.Rd b/man/PrintCalcVarExpRatioParams.Rd new file mode 100644 index 000000000..d71fe1ee8 --- /dev/null +++ b/man/PrintCalcVarExpRatioParams.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintCalcVarExpRatioParams} +\alias{PrintCalcVarExpRatioParams} +\title{Print Parameters Associated with CalcVarExpRatio} +\usage{ +PrintCalcVarExpRatioParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for CalcVarExpRatio. Default (FALSE) will print a nicely +formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for CalcVarExpRatio. 
+} diff --git a/man/PrintDMParams.Rd b/man/PrintDMParams.Rd new file mode 100644 index 000000000..b2a5662d8 --- /dev/null +++ b/man/PrintDMParams.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintDMParams} +\alias{PrintDMParams} +\title{Print Diffusion Map Calculation Parameters} +\usage{ +PrintDMParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for the RunDiffusion calculation. Default (FALSE) will print a +nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored diffusion map calculation. +} diff --git a/man/PrintDim.Rd b/man/PrintDim.Rd new file mode 100644 index 000000000..7de9bb45c --- /dev/null +++ b/man/PrintDim.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PrintDim} +\alias{PrintDim} +\title{Print the results of a dimensional reduction analysis} +\usage{ +PrintDim(object, reduction.type = "pca", dims.print = 1:5, + genes.print = 30, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Reduction technique to print results for} + +\item{dims.print}{Number of dimensions to display} + +\item{genes.print}{Number of genes to display} + +\item{use.full}{Use full PCA (i.e. 
the projected PCA, by default FALSE)} +} +\value{ +Set of genes defining the components +} +\description{ +Prints a set of genes that most strongly define a set of components +} diff --git a/man/PrintFindClustersParams.Rd b/man/PrintFindClustersParams.Rd new file mode 100644 index 000000000..e387aa2c9 --- /dev/null +++ b/man/PrintFindClustersParams.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintFindClustersParams} +\alias{PrintFindClustersParams} +\title{Print FindClusters Calculation Parameters} +\usage{ +PrintFindClustersParams(object, resolution, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{resolution}{Optionally specify only a subset of resolutions to print +parameters for.} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for the FindClusters calculation. Default (FALSE) will print a +nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest FindClusters calculation for each +stored resolution. +} diff --git a/man/PrintICA.Rd b/man/PrintICA.Rd new file mode 100644 index 000000000..294d36a80 --- /dev/null +++ b/man/PrintICA.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PrintICA} +\alias{PrintICA} +\title{Print the results of an ICA analysis} +\usage{ +PrintICA(object, ics.print = 1:5, genes.print = 30, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.print}{Number of genes to print for each IC} + +\item{use.full}{Use full PCA (i.e. 
the projected PCA, by default FALSE)} + +\item{ics.print}{Set of ICs to print genes for} +} +\value{ +Only text output +} +\description{ +Prints a set of genes that most strongly define a set of independent components +} diff --git a/man/PrintICAParams.Rd b/man/PrintICAParams.Rd new file mode 100644 index 000000000..956603f12 --- /dev/null +++ b/man/PrintICAParams.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintICAParams} +\alias{PrintICAParams} +\title{Print ICA Calculation Parameters} +\usage{ +PrintICAParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot (calc.params) for the ICA +calculation. Default (FALSE) will print a nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored ICA calculation. +} diff --git a/man/PrintPCA.Rd b/man/PrintPCA.Rd new file mode 100644 index 000000000..5525d48d3 --- /dev/null +++ b/man/PrintPCA.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{PrintPCA} +\alias{PrintPCA} +\title{Print the results of a PCA analysis} +\usage{ +PrintPCA(object, pcs.print = 1:5, genes.print = 30, use.full = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{pcs.print}{Set of PCs to print genes for} + +\item{genes.print}{Number of genes to print for each PC} + +\item{use.full}{Use full PCA (i.e. 
the projected PCA, by default FALSE)} +} +\value{ +Only text output +} +\description{ +Prints a set of genes that most strongly define a set of principal components +} diff --git a/man/PrintPCAParams.Rd b/man/PrintPCAParams.Rd new file mode 100644 index 000000000..9e211de6e --- /dev/null +++ b/man/PrintPCAParams.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintPCAParams} +\alias{PrintPCAParams} +\title{Print PCA Calculation Parameters} +\usage{ +PrintPCAParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot +(calc.params) for the RunPCA calculation. Default (FALSE) will print a nicely +formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored PCA calculation. +} diff --git a/man/PrintSNNParams.Rd b/man/PrintSNNParams.Rd new file mode 100644 index 000000000..973d5667a --- /dev/null +++ b/man/PrintSNNParams.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintSNNParams} +\alias{PrintSNNParams} +\title{Print SNN Construction Calculation Parameters} +\usage{ +PrintSNNParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot (calc.params) for the +BuildSNN calculation. Default (FALSE) will print a nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored SNN calculation (via BuildSNN or FindClusters). 
+} diff --git a/man/PrintTSNEParams.Rd b/man/PrintTSNEParams.Rd new file mode 100644 index 000000000..116035272 --- /dev/null +++ b/man/PrintTSNEParams.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/printing_utilities.R +\name{PrintTSNEParams} +\alias{PrintTSNEParams} +\title{Print TSNE Calculation Parameters} +\usage{ +PrintTSNEParams(object, raw = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{raw}{Print the entire contents of the calculation settings slot (calc.params) for the +RunTSNE calculation. Default (FALSE) will print a nicely formatted summary.} +} +\value{ +No return value. Only prints to console. +} +\description{ +Print the parameters chosen for the latest stored TSNE calculation. +} diff --git a/man/ProjectDim.Rd b/man/ProjectDim.Rd new file mode 100644 index 000000000..dd7ac5b30 --- /dev/null +++ b/man/ProjectDim.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{ProjectDim} +\alias{ProjectDim} +\title{Project Dimensional reduction onto full dataset} +\usage{ +ProjectDim(object, reduction.type = "pca", dims.print = 1:5, + dims.store = 30, genes.print = 30, replace.dim = FALSE, + do.center = FALSE, do.print = TRUE, assay.type = "RNA") +} +\arguments{ +\item{object}{Seurat object} + +\item{dims.print}{Number of dims to print genes for} + +\item{dims.store}{Number of dims to store (default is 30)} + +\item{genes.print}{Number of genes with highest/lowest loadings to print for +each PC} + +\item{replace.dim}{Replace the existing data (overwrite +object@dr$XXX@gene.loadings), not done by default.} + +\item{do.center}{Center the dataset prior to projection (should be set to TRUE)} + +\item{do.print}{Print top genes associated with the projected dimensions} + +\item{assay.type}{Data type, RNA by default. Can be changed for multimodal +datasets (i.e. 
project a PCA done on RNA, onto CITE-seq data)} +} +\value{ +Returns Seurat object with the projected values in +object@dr$XXX@gene.loadings.full +} +\description{ +Takes a pre-computed dimensional reduction (typically calculated on a subset +of genes) and projects this onto the entire dataset (all genes). Note that +the cell loadings will remain unchanged, but now there are gene loadings for +all genes. +} diff --git a/man/ProjectPCA.Rd b/man/ProjectPCA.Rd new file mode 100644 index 000000000..2a657b2bf --- /dev/null +++ b/man/ProjectPCA.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{ProjectPCA} +\alias{ProjectPCA} +\title{Project Principal Components Analysis onto full dataset} +\usage{ +ProjectPCA(object, do.print = TRUE, pcs.print = 1:5, pcs.store = 30, + genes.print = 30, replace.pc = FALSE, do.center = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{do.print}{Print top genes associated with the projected PCs} + +\item{pcs.print}{Number of PCs to print genes for} + +\item{pcs.store}{Number of PCs to store (default is 30)} + +\item{genes.print}{Number of genes with highest/lowest loadings to print for +each PC} + +\item{replace.pc}{Replace the existing PCA (overwrite +object@dr$pca@gene.loadings), not done by default.} + +\item{do.center}{Center the dataset prior to projection (should be set to TRUE)} +} +\value{ +Returns Seurat object with the projected PCA values in +object@dr$pca@gene.loadings.full +} +\description{ +Takes a pre-computed PCA (typically calculated on a subset of genes) and +projects this onto the entire dataset (all genes). Note that the cell +loadings remain unchanged, but now there are gene loading scores for all +genes. 
+} diff --git a/man/Read10X.Rd b/man/Read10X.Rd new file mode 100644 index 000000000..32094020d --- /dev/null +++ b/man/Read10X.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{Read10X} +\alias{Read10X} +\title{Load in data from 10X} +\usage{ +Read10X(data.dir = NULL) +} +\arguments{ +\item{data.dir}{Directory containing the matrix.mtx, genes.tsv, and barcodes.tsv +files provided by 10X. A vector or named vector can be given in order to load +several data directories. If a named vector is given, the cell barcode names +will be prefixed with the name.} +} +\value{ +Returns a sparse matrix with rows and columns labeled +} +\description{ +Enables easy loading of sparse data matrices provided by 10X genomics. +} diff --git a/man/RefinedMapping.Rd b/man/RefinedMapping.Rd new file mode 100644 index 000000000..1056be527 --- /dev/null +++ b/man/RefinedMapping.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial.R +\name{RefinedMapping} +\alias{RefinedMapping} +\title{Quantitative refinement of spatial inferences} +\usage{ +RefinedMapping(object, genes.use) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Genes to use to drive the refinement procedure.} +} +\value{ +Seurat object, where mapping probabilities for each bin are stored +in object@final.prob +} +\description{ +Refines the initial mapping with more complex models that allow gene +expression to vary quantitatively across bins (instead of 'on' or 'off'), +and that also considers the covariance structure between genes. +} +\details{ +Full details given in spatial mapping manuscript. 
+} diff --git a/man/RegressOut.Rd b/man/RegressOut.Rd new file mode 100644 index 000000000..edf0137cd --- /dev/null +++ b/man/RegressOut.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing_internal.R +\name{RegressOut} +\alias{RegressOut} +\title{Regress out technical effects and cell cycle} +\usage{ +RegressOut(object, latent.vars, genes.regress = NULL, model.use = "linear", + use.umi = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{latent.vars}{effects to regress out} + +\item{genes.regress}{gene to run regression for (default is all genes)} + +\item{model.use}{Use a linear model or generalized linear model (poisson, negative binomial) for the regression. Options are 'linear' (default), 'poisson', and 'negbinom'} + +\item{use.umi}{Regress on UMI count data. Default is FALSE for linear modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'} +} +\value{ +Returns the residuals from the regression model +} +\description{ +Remove unwanted effects from scale.data +} +\keyword{internal} diff --git a/man/RemoveFromTable.Rd b/man/RemoveFromTable.Rd new file mode 100644 index 000000000..e7cdd0331 --- /dev/null +++ b/man/RemoveFromTable.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{RemoveFromTable} +\alias{RemoveFromTable} +\title{Remove data from a table} +\usage{ +RemoveFromTable(to.remove, data) +} +\arguments{ +\item{to.remove}{A vector of values that indicate removal} + +\item{data}{A data frame or matrix} +} +\value{ +A data frame or matrix with values removed by row +} +\description{ +This function will remove any rows from a data frame or matrix +that contain certain values +} diff --git a/man/RenameIdent.Rd b/man/RenameIdent.Rd new file mode 100644 index 000000000..8233f57b1 --- /dev/null +++ b/man/RenameIdent.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
documentation in R/interaction.R +\name{RenameIdent} +\alias{RenameIdent} +\title{Rename one identity class to another} +\usage{ +RenameIdent(object, old.ident.name = NULL, new.ident.name = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{old.ident.name}{The old identity class (to be renamed)} + +\item{new.ident.name}{The new name to apply} +} +\value{ +A Seurat object where object@ident has been appropriately modified +} +\description{ +Can also be used to join identity classes together (for example, to merge clusters). +} diff --git a/man/ReorderIdent.Rd b/man/ReorderIdent.Rd new file mode 100644 index 000000000..b48aff67e --- /dev/null +++ b/man/ReorderIdent.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{ReorderIdent} +\alias{ReorderIdent} +\title{Reorder identity classes} +\usage{ +ReorderIdent(object, feature = "PC1", rev = FALSE, aggregate.fxn = mean, + reorder.numeric = FALSE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{feature}{Feature to reorder on. Default is PC1} + +\item{rev}{Reverse ordering (default is FALSE)} + +\item{aggregate.fxn}{Function to evaluate each identity class based on (default is mean)} + +\item{reorder.numeric}{Rename all identity classes to be increasing numbers starting from 1 (default is FALSE)} + +\item{\dots}{additional arguments (i.e. use.imputed=TRUE)} +} +\value{ +A Seurat object where the identity classes have been re-ordered based on the average. 
+} +\description{ +Re-assigns the identity classes according to the average expression of a particular feature (i.e, gene expression, or PC score) +Very useful after clustering, to re-order cells, for example, based on PC scores +} diff --git a/man/RunCCA.Rd b/man/RunCCA.Rd new file mode 100644 index 000000000..42299c6e9 --- /dev/null +++ b/man/RunCCA.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{RunCCA} +\alias{RunCCA} +\title{Perform Canonical Correlation Analysis} +\usage{ +RunCCA(object, object2, group1, group2, group.by, num.cc = 20, genes.use, + scale.data = TRUE, rescale.groups = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{object2}{Optional second object. If object2 is passed, object1 will be +considered as group1 and object2 as group2.} + +\item{group1}{First set of cells (or IDs) for CCA} + +\item{group2}{Second set of cells (or IDs) for CCA} + +\item{group.by}{Factor to group by (column vector stored in object@meta.data)} + +\item{num.cc}{Number of canonical vectors to calculate} + +\item{genes.use}{Set of genes to use in CCA. Default is object@var.genes. If +two objects are given, the default is the union of both variable gene sets +that are also present in both objects.} + +\item{scale.data}{Use the scaled data from the object} + +\item{rescale.groups}{Rescale each set of cells independently} +} +\value{ +Returns Seurat object with the CCA stored in the @dr$cca slot. If +one object is passed, the same object is returned. If two are passed, a +combined object is returned. +} +\description{ +Runs a canonical correlation analysis using a diagonal implementation of CCA. +For details about stored CCA calculation parameters, see +\code{\link{PrintCCAParams}}. 
+} diff --git a/man/RunDiffusion.Rd b/man/RunDiffusion.Rd new file mode 100644 index 000000000..e06274b02 --- /dev/null +++ b/man/RunDiffusion.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{RunDiffusion} +\alias{RunDiffusion} +\title{Run diffusion map} +\usage{ +RunDiffusion(object, cells.use = NULL, dims.use = 1:5, genes.use = NULL, + reduction.use = "pca", q.use = 0.01, max.dim = 2, scale.clip = 10, + ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{Which cells to analyze (default, all cells)} + +\item{dims.use}{Which dimensions to use as input features} + +\item{genes.use}{If set, run the diffusion map procedure on this subset of +genes (instead of running on a set of reduced dimensions). Not set (NULL) by +default} + +\item{reduction.use}{Which dimensional reduction (PCA or ICA) to use for the +diffusion map. Default is PCA} + +\item{q.use}{Quantile to use} + +\item{max.dim}{Max dimension to keep from diffusion calculation} + +\item{scale.clip}{Max/min value for scaled data. Default is 10} + +\item{...}{Additional arguments to the diffuse call} +} +\value{ +Returns a Seurat object with a diffusion map +} +\description{ +Run diffusion map +} diff --git a/man/RunICA.Rd b/man/RunICA.Rd new file mode 100644 index 000000000..9faf4b12f --- /dev/null +++ b/man/RunICA.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{RunICA} +\alias{RunICA} +\title{Run Independent Component Analysis on gene expression} +\usage{ +RunICA(object, ic.genes = NULL, ics.compute = 50, use.imputed = FALSE, + rev.ica = FALSE, print.results = TRUE, ics.print = 1:5, + genes.print = 50, ica.function = "icafast", seed.use = 1, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{ic.genes}{Genes to use as input for ICA. 
Default is object@var.genes} + +\item{ics.compute}{Number of ICs to compute} + +\item{use.imputed}{Run ICA on imputed values (FALSE by default)} + +\item{rev.ica}{By default, computes the dimensional reduction on the cell x +gene matrix. Setting to true will compute it on the transpose (gene x cell +matrix).} + +\item{print.results}{Print the top genes associated with each dimension} + +\item{ics.print}{ICs to print genes for} + +\item{genes.print}{Number of genes to print for each IC} + +\item{ica.function}{ICA function from ica package to run (options: icafast, +icaimax, icajade)} + +\item{seed.use}{Random seed to use for fastica} + +\item{\dots}{Additional arguments to be passed to fastica} +} +\value{ +Returns Seurat object with an ICA calculation stored in +object@dr$ica +} +\description{ +Run fastica algorithm from the ica package for ICA dimensionality reduction. +For details about stored ICA calculation parameters, see +\code{\link{PrintICAParams}}. +} diff --git a/man/RunPCA.Rd b/man/RunPCA.Rd new file mode 100644 index 000000000..7c1723818 --- /dev/null +++ b/man/RunPCA.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{RunPCA} +\alias{RunPCA} +\title{Run Principal Component Analysis on gene expression using IRLBA} +\usage{ +RunPCA(object, pc.genes = NULL, pcs.compute = 20, use.imputed = FALSE, + rev.pca = FALSE, weight.by.var = TRUE, do.print = TRUE, + pcs.print = 1:5, genes.print = 30, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{pc.genes}{Genes to use as input for PCA. Default is object@var.genes} + +\item{pcs.compute}{Total Number of PCs to compute and store} + +\item{use.imputed}{Run PCA on imputed values (FALSE by default)} + +\item{rev.pca}{By default computes the PCA on the cell x gene matrix. 
Setting +to true will compute it on gene x cell matrix.} + +\item{weight.by.var}{Weight the cell embeddings by the variance of each PC +(weights the gene loadings if rev.pca is TRUE)} + +\item{do.print}{Print the top genes associated with high/low loadings for +the PCs} + +\item{pcs.print}{PCs to print genes for} + +\item{genes.print}{Number of genes to print for each PC} + +\item{\dots}{Additional arguments to be passed to IRLBA} +} +\value{ +Returns Seurat object with the PCA calculation stored in +object@dr$pca. +} +\description{ +Run a PCA dimensionality reduction. For details about stored PCA calculation +parameters, see \code{\link{PrintPCAParams}}. +} diff --git a/man/RunTSNE.Rd b/man/RunTSNE.Rd new file mode 100644 index 000000000..68c49dd57 --- /dev/null +++ b/man/RunTSNE.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction.R +\name{RunTSNE} +\alias{RunTSNE} +\title{Run t-distributed Stochastic Neighbor Embedding} +\usage{ +RunTSNE(object, reduction.use = "pca", cells.use = NULL, dims.use = 1:5, + genes.use = NULL, seed.use = 1, do.fast = TRUE, add.iter = 0, + dim.embed = 2, distance.matrix = NULL, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.use}{Which dimensional reduction (e.g. PCA, ICA) to use for +the tSNE. Default is PCA} + +\item{cells.use}{Which cells to analyze (default, all cells)} + +\item{dims.use}{Which dimensions to use as input features} + +\item{genes.use}{If set, run the tSNE on this subset of genes +(instead of running on a set of reduced dimensions). Not set (NULL) by default} + +\item{seed.use}{Random seed for the t-SNE} + +\item{do.fast}{If TRUE, uses the Barnes-hut implementation, which runs +faster, but is less flexible. 
TRUE by default.} + +\item{add.iter}{If an existing tSNE has already been computed, uses the +current tSNE to seed the algorithm and then adds additional iterations on top +of this} + +\item{dim.embed}{The dimensional space of the resulting tSNE embedding +(default is 2). For example, set to 3 for a 3d tSNE} + +\item{distance.matrix}{If set, runs tSNE on the given distance matrix +instead of data matrix (experimental)} + +\item{\dots}{Additional arguments to the tSNE call. Most commonly used is +perplexity (expected number of neighbors default is 30)} +} +\value{ +Returns a Seurat object with a tSNE embedding in +object@dr$tsne@cell.embeddings +} +\description{ +Run t-SNE dimensionality reduction on selected features. Has the option of +running in a reduced dimensional space (i.e. spectral tSNE, recommended), +or running based on a set of genes. For details about stored TSNE calculation +parameters, see \code{\link{PrintTSNEParams}}. +} diff --git a/man/SampleUMI.Rd b/man/SampleUMI.Rd new file mode 100644 index 000000000..5a342e284 --- /dev/null +++ b/man/SampleUMI.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{SampleUMI} +\alias{SampleUMI} +\title{Sample UMI} +\usage{ +SampleUMI(data, max.umi = 1000, upsample = FALSE, progress.bar = TRUE) +} +\arguments{ +\item{data}{Matrix with the raw count data} + +\item{max.umi}{Number of UMIs to sample to} + +\item{upsample}{Upsamples all cells with fewer than max.umi} + +\item{progress.bar}{Display the progress bar} +} +\value{ +Matrix with downsampled data +} +\description{ +Downsample each cell to a specified number of UMIs. Includes +an option to upsample cells below specified UMI as well. 
+} diff --git a/man/SaveClusters.Rd b/man/SaveClusters.Rd new file mode 100644 index 000000000..06f1e93b3 --- /dev/null +++ b/man/SaveClusters.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{SaveClusters} +\alias{SaveClusters} +\title{Save cluster assignments to a TSV file} +\usage{ +SaveClusters(object, file) +} +\arguments{ +\item{object}{Seurat object with cluster assignments} + +\item{file}{Path to file to write cluster assignments to} +} +\value{ +No return value. Writes clusters assignments to specified file. +} +\description{ +Save cluster assignments to a TSV file +} diff --git a/man/ScaleData.Rd b/man/ScaleData.Rd new file mode 100644 index 000000000..ab3f72099 --- /dev/null +++ b/man/ScaleData.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{ScaleData} +\alias{ScaleData} +\title{Scale and center the data.} +\usage{ +ScaleData(object, genes.use = NULL, data.use = NULL, latent.vars, + model.use = "linear", use.umi = FALSE, do.scale = TRUE, + do.center = TRUE, scale.max = 10, block.size = 1000, + min.cells.to.block = 3000, display.progress = TRUE, assay.type = "RNA", + do.cpp = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Vector of gene names to scale/center. Default is all genes +in object@data.} + +\item{data.use}{Can optionally pass a matrix of data to scale, default is +object@data[genes.use, ]} + +\item{latent.vars}{effects to regress out} + +\item{model.use}{Use a linear model or generalized linear model +(poisson, negative binomial) for the regression. Options are 'linear' +(default), 'poisson', and 'negbinom'} + +\item{use.umi}{Regress on UMI count data. 
Default is FALSE for linear +modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'} + +\item{do.scale}{Whether to scale the data.} + +\item{do.center}{Whether to center the data.} + +\item{scale.max}{Max value to return for scaled data. The default is 10. +Setting this can help reduce the effects of genes that are only expressed in +a very small number of cells.} + +\item{block.size}{Default size for number of genes to scale at in a single +computation. Increasing block.size may speed up calculations but at an +additional memory cost.} + +\item{min.cells.to.block}{If object contains fewer than this number of cells, +don't block for scaling calculations.} + +\item{display.progress}{Displays a progress bar for scaling procedure} + +\item{assay.type}{Assay to scale data for. Default is RNA. Can be changed for +multimodal analyses.} + +\item{do.cpp}{By default (TRUE), most of the heavy lifting is done in c++. +We've maintained support for our previous implementation in R for +reproducibility (set this to FALSE) as results can change slightly due to +differences in numerical precision which could affect downstream calculations.} +} +\value{ +Returns a seurat object with object@scale.data updated with scaled +and/or centered data. +} +\description{ +Scales and centers the data. If latent variables are provided (latent.vars), their effects are +removed through regression and the resulting residuals are then scaled and centered. +} diff --git a/man/ScaleDataR.Rd b/man/ScaleDataR.Rd new file mode 100644 index 000000000..6fdd6beed --- /dev/null +++ b/man/ScaleDataR.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{ScaleDataR} +\alias{ScaleDataR} +\title{Old R based implementation of ScaleData. 
Scales and centers the data} +\usage{ +ScaleDataR(object, genes.use = NULL, data.use = NULL, do.scale = TRUE, + do.center = TRUE, scale.max = 10) +} +\arguments{ +\item{object}{Seurat object} + +\item{genes.use}{Vector of gene names to scale/center. Default is all genes in object@data.} + +\item{data.use}{Can optionally pass a matrix of data to scale, default is object@data[genes.use,]} + +\item{do.scale}{Whether to scale the data.} + +\item{do.center}{Whether to center the data.} + +\item{scale.max}{Max value to accept for scaled data. The default is 10. Setting this can help +reduce the effects of genes that are only expressed in a very small number of cells.} +} +\value{ +Returns a seurat object with object@scale.data updated with scaled and/or centered data. +} +\description{ +Old R based implementation of ScaleData. Scales and centers the data +} diff --git a/man/SetAllIdent.Rd b/man/SetAllIdent.Rd new file mode 100644 index 000000000..a3ead6ed8 --- /dev/null +++ b/man/SetAllIdent.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{SetAllIdent} +\alias{SetAllIdent} +\title{Switch identity class definition to another variable} +\usage{ +SetAllIdent(object, id = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{id}{Variable to switch identity class to (for example, 'DBclust.ident', the output +of density clustering) Default is orig.ident - the original annotation pulled from the cell name.} +} +\value{ +A Seurat object where object@ident has been appropriately modified +} +\description{ +Switch identity class definition to another variable +} diff --git a/man/SetAssayData.Rd b/man/SetAssayData.Rd new file mode 100644 index 000000000..4d4d458e2 --- /dev/null +++ b/man/SetAssayData.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multi_modal.R +\name{SetAssayData} +\alias{SetAssayData} +\title{Assay Data Mutator Function} +\usage{ 
+SetAssayData(object, assay.type, slot, new.data) +} +\arguments{ +\item{object}{Seurat object} + +\item{assay.type}{Type of assay to fetch data for (default is RNA)} + +\item{slot}{Specific information to pull (i.e. raw.data, data, scale.data,...). Default is data} + +\item{new.data}{New data to insert} +} +\value{ +Seurat object with updated slot +} +\description{ +Store information for specified assay, for multimodal analysis +} diff --git a/man/SetClusters.Rd b/man/SetClusters.Rd new file mode 100644 index 000000000..166db335d --- /dev/null +++ b/man/SetClusters.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_determination.R +\name{SetClusters} +\alias{SetClusters} +\title{Set Cluster Assignments} +\usage{ +SetClusters(object, clusters = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{clusters}{A dataframe containing the cell names and cluster assignments +to set for the object.} +} +\value{ +Returns a Seurat object with the identities set to the cluster +assignments that were passed. +} +\description{ +Easily set the cluster assignments using the output of GetClusters() --- +a dataframe with cell names as the first column and cluster assignments as +the second. 
+} diff --git a/man/SetDimReduction.Rd b/man/SetDimReduction.Rd new file mode 100644 index 000000000..782ac8692 --- /dev/null +++ b/man/SetDimReduction.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimensional_reduction_utilities.R +\name{SetDimReduction} +\alias{SetDimReduction} +\title{Dimensional Reduction Mutator Function} +\usage{ +SetDimReduction(object, reduction.type, slot, new.data) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Type of dimensional reduction to set} + +\item{slot}{Specific information to set (must be one of the following: +"cell.embeddings", "gene.loadings", "gene.loadings.full", "sdev", "key", +"misc")} + +\item{new.data}{New data to set} +} +\value{ +Seurat object with updated slot +} +\description{ +Set information for specified stored dimensional reduction analysis +} diff --git a/man/SetIdent.Rd b/man/SetIdent.Rd new file mode 100644 index 000000000..cace68a85 --- /dev/null +++ b/man/SetIdent.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{SetIdent} +\alias{SetIdent} +\title{Set identity class information} +\usage{ +SetIdent(object, cells.use = NULL, ident.use = NULL) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{Vector of cells to set identity class info for (default is +all cells)} + +\item{ident.use}{Vector of identity class values to assign (character +vector)} +} +\value{ +A Seurat object where object@ident has been appropriately modified +} +\description{ +Sets the identity class value for a subset (or all) cells +} diff --git a/man/Seurat-deprecated.Rd b/man/Seurat-deprecated.Rd new file mode 100644 index 000000000..bebb95e33 --- /dev/null +++ b/man/Seurat-deprecated.Rd @@ -0,0 +1,200 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deprecated_functions.R +\name{Seurat-deprecated} +\alias{Seurat-deprecated} +\alias{vlnPlot} 
+\alias{subsetData} +\alias{mean.var.plot} +\alias{pca} +\alias{PCA} +\alias{project.pca} +\alias{print.pca} +\alias{viz.pca} +\alias{set.ident} +\alias{pca.plot} +\alias{pcHeatmap} +\alias{jackStraw} +\alias{jackStrawPlot} +\alias{run_tsne} +\alias{tsne.plot} +\alias{find.markers} +\alias{find_all_markers} +\alias{genePlot} +\alias{feature.plot} +\alias{tnse.plot} +\alias{buildClusterTree} +\alias{plotClusterTree} +\alias{plotNoiseModel} +\alias{add_samples} +\alias{subsetCells} +\alias{project.samples} +\alias{run_diffusion} +\alias{ica} +\alias{ICA} +\alias{cluster.alpha} +\alias{reorder.ident} +\alias{average.pca} +\alias{average.expression} +\alias{icTopGenes} +\alias{pcTopGenes} +\alias{pcTopCells} +\alias{fetch.data} +\alias{viz.ica} +\alias{regulatorScore} +\alias{find.markers.node} +\alias{diffExp.test} +\alias{tobit.test} +\alias{batch.gene} +\alias{marker.test} +\alias{diff.t.test} +\alias{which.cells} +\alias{set.all.ident} +\alias{rename.ident} +\alias{posterior.plot} +\alias{map.cell} +\alias{get.centroids} +\alias{refined.mapping} +\alias{initial.mapping} +\alias{calc.insitu} +\alias{fit.gene.k} +\alias{fit.gene.mix} +\alias{addSmoothedScore} +\alias{addImputedScore} +\alias{getNewScore} +\alias{calcNoiseModels} +\alias{feature.plot.keynote} +\alias{feature.heatmap} +\alias{ica.plot} +\alias{dim.plot} +\alias{spatial.de} +\alias{DBclust_dimension} +\alias{Kclust_dimension} +\alias{pca.sig.genes} +\alias{doHeatMap} +\alias{icHeatmap} +\alias{doKMeans} +\alias{genes.in.cluster} +\alias{kMeansHeatmap} +\alias{cell.cor.matrix} +\alias{gene.cor.matrix} +\alias{calinskiPlot} +\alias{dot.plot} +\alias{addMetaData} +\alias{removePC} +\alias{geneScorePlot} +\alias{cellPlot} +\alias{jackStraw.permutation.test} +\alias{jackStrawMC} +\alias{jackStrawFull} +\alias{writ.table} +\alias{jackRandom} +\alias{MeanVarPlot} +\alias{HeatmapNode} +\alias{minusr} +\alias{minusc} +\title{Deprecated function(s) in the Seurat package} +\usage{ +vlnPlot(...) 
+} +\arguments{ +\item{...}{Parameters to be passed to the modern version of the function} +} +\description{ +These functions are provided for compatibility with older versions of the Seurat package. They may eventually be completely removed. +} +\section{Details}{ + +\tabular{rl}{ + \code{vlnPlot} \tab now a synonym for \code{\link{VlnPlot}}\cr + \code{subsetData} \tab now a synonym for \code{\link{SubsetData}}\cr + \code{mean.var.plot} \tab now a synonym for \code{\link{MeanVarPlot}}\cr + \code{pca} \tab now a synonym for \code{\link{RunPCA}}\cr + \code{PCA} \tab now a synonym for \code{\link{RunPCA}}\cr + \code{project.pca} \tab now a synonym for \code{\link{ProjectPCA}}\cr + \code{print.pca} \tab now a synonym for \code{\link{PrintPCA}}\cr + \code{viz.pca} \tab now a synonym for \code{\link{VizPCA}}\cr + \code{set.ident} \tab now a synonym for \code{\link{SetIdent}}\cr + \code{pca.plot} \tab now a synonym for \code{\link{PCAPlot}}\cr + \code{pcHeatmap} \tab now a synonym for \code{\link{PCHeatmap}}\cr + \code{jackStraw} \tab now a synonym for \code{\link{JackStraw}}\cr + \code{jackStrawPlot} \tab now a synonym for \code{\link{JackStrawPlot}}\cr + \code{run_tsne} \tab now a synonym for \code{\link{RunTSNE}}\cr + \code{tsne.plot} \tab now a synonym for \code{\link{TSNEPlot}}\cr + \code{find.markers} \tab now a synonym for \code{\link{FindMarkers}}\cr + \code{find_all_markers} \tab now a synonym for \code{\link{FindAllMarkers}}\cr + \code{genePlot} \tab now a synonym for \code{\link{GenePlot}}\cr + \code{feature.plot} \tab now a synonym for \code{\link{FeaturePlot}}\cr + \code{buildClusterTree} \tab now a synonym for \code{\link{BuildClusterTree}}\cr + \code{plotClusterTree} \tab now a synonym for \code{\link{PlotClusterTree}}\cr + \code{plotNoiseModel} \tab now a synonym for \code{\link{PlotNoiseModel}}\cr + \code{add_samples} \tab now a synonym for \code{\link{AddSamples}}\cr + \code{subsetCells} \tab now deleted\cr + \code{project.samples} \tab now a synonym for 
\code{\link{ProjectSamples}}\cr + \code{run_diffusion} \tab now a synonym for \code{\link{RunDiffusion}}\cr + \code{ica} \tab now a synonym for \code{\link{RunICA}}\cr + \code{ICA} \tab now a synonym for \code{\link{RunICA}}\cr + \code{cluster.alpha} \tab now a synonym for \code{\link{AverageDetectionRate}}\cr + \code{reorder.ident} \tab now a synonym for \code{\link{ReorderIdent}}\cr + \code{average.pca} \tab now a synonym for \code{\link{AveragePCA}}\cr + \code{average.expression} \tab now a synonym for \code{\link{AverageExpression}}\cr + \code{icTopGenes} \tab now a synonym for \code{\link{ICTopGenes}}\cr + \code{pcTopGenes} \tab now a synonym for \code{\link{PCTopGenes}}\cr + \code{pcTopCells} \tab now a synonym for \code{\link{PCTopCells}}\cr + \code{fetch.data} \tab now a synonym for \code{\link{FetchData}}\cr + \code{viz.ica} \tab now a synonym for \code{\link{VizIca}}\cr + \code{regulatorScore} \tab now deleted\cr + \code{find.markers.node} \tab now a synonym for \code{\link{FindMarkersNode}}\cr + \code{diffExp.test} \tab now a synonym for \code{\link{DiffExpTest}}\cr + \code{tobit.test} \tab now a synonym for \code{\link{TobitTest}}\cr + \code{batch.gene} \tab now a synonym for \code{\link{BatchGene}}\cr + \code{marker.test} \tab now a synonym for \code{\link{MarkerTest}}\cr + \code{diff.t.test} \tab now a synonym for \code{\link{DiffTTest}}\cr + \code{which.cells} \tab now a synonym for \code{\link{WhichCells}}\cr + \code{set.all.ident} \tab now a synonym for \code{\link{SetAllIdent}}\cr + \code{rename.ident} \tab now a synonym for \code{\link{RenameIdent}}\cr + \code{posterior.plot} \tab now a synonym for \code{\link{PosteriorPlot}}\cr + \code{map.cell} \tab now a synonym for \code{\link{MapCell}}\cr + \code{get.centroids} \tab now a synonym for \code{\link{GetCentroids}}\cr + \code{refined.mapping} \tab now a synonym for \code{\link{RefinedMapping}}\cr + \code{initial.mapping} \tab now a synonym for \code{\link{InitialMapping}}\cr + \code{calc.insitu} 
\tab now a synonym for \code{\link{CalcInsitu}}\cr + \code{fit.gene.k} \tab now a synonym for \code{\link{FitGeneK}}\cr + \code{fit.gene.mix} \tab now a synonym for \code{\link{FitGeneMix}}\cr + \code{addSmoothedScore} \tab now a synonym for \code{\link{AddSmoothedScore}}\cr + \code{addImputedScore} \tab now a synonym for \code{\link{AddImputedScore}}\cr + \code{getNewScore} \tab now a synonym for \code{\link{GetNewScore}}\cr + \code{calcNoiseModels} \tab now a synonym for \code{\link{CalcNoiseModels}}\cr + \code{feature.plot.keynote} \tab now a synonym for \code{\link{FeaturePlotKeynote}}\cr + \code{feature.heatmap} \tab now a synonym for \code{\link{FeatureHeatmap}}\cr + \code{ica.plot} \tab now a synonym for \code{\link{ICAPlot}}\cr + \code{dim.plot} \tab now a synonym for \code{\link{DimPlot}}\cr + \code{spatial.de} \tab now a synonym for \code{\link{SpatialDe}}\cr + \code{DBclust_dimension} \tab now a synonym for \code{\link{DBClustDimension}}\cr + \code{Kclust_dimension} \tab now a synonym for \code{\link{KClustDimension}}\cr + \code{pca.sig.genes} \tab now a synonym for \code{\link{PCASigGenes}}\cr + \code{doHeatMap} \tab now a synonym for \code{\link{DoHeatMap}}\cr + \code{icHeatmap} \tab now a synonym for \code{\link{ICHeatmap}}\cr + \code{doKMeans} \tab now a synonym for \code{\link{DoKMeans}}\cr + \code{genes.in.cluster} \tab now a synonym for \code{\link{GenesInCluster}}\cr + \code{kMeansHeatmap} \tab now a synonym for \code{\link{KMeansHeatmap}}\cr + \code{cell.cor.matrix} \tab now a synonym for \code{\link{CellCorMatrix}}\cr + \code{gene.cor.matrix} \tab now a synonym for \code{\link{GeneCorMatrix}}\cr + \code{calinskiPlot} \tab now a synonym for \code{\link{CalinskiPlot}}\cr + \code{dot.plot} \tab now a synonym for \code{\link{DotPlot}}\cr + \code{addMetaData} \tab now a synonym for \code{\link{AddMetaData}}\cr + \code{removePC} \tab now a synonym for \code{\link{RemovePC}}\cr + \code{geneScorePlot} \tab now deleted\cr + \code{cellPlot} \tab now a 
synonym for \code{\link{CellPlot}}\cr + \code{jackStraw.permutation.test} \tab now a synonym for \code{\link{JackStrawPermutationTest}}\cr + \code{jackStrawMC} \tab now a synonym for \code{\link{JackStrawMC}}\cr + \code{jackStrawFull} \tab now a synonym for \code{\link{JackStrawFull}}\cr + \code{PCAFast} \tab now a synonym for \code{\link{PCA}}\cr + \code{writ.table} \tab is deleted without replacement\cr + \code{jackRandom} \tab now a synonym for \code{\link{JackRandom}}\cr + \code{MeanVarPlot} \tab now a synonym for \code{\link{FindVariableGenes}}\cr + \code{myPalette} \tab now a synonym for \code{\link{CustomPalette}}\cr + \code{minusr} \tab now a synonym for \code{\link{SubsetRow}}\cr + \code{minusc} \tab now a synonym for \code{\link{SubsetColumn}}\cr +} +} + diff --git a/man/Shuffle.Rd b/man/Shuffle.Rd new file mode 100644 index 000000000..9bfbed8f8 --- /dev/null +++ b/man/Shuffle.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{Shuffle} +\alias{Shuffle} +\title{Shuffle a vector} +\usage{ +Shuffle(x) +} +\arguments{ +\item{x}{A vector} +} +\value{ +A vector with the same values of x, just in random order +} +\description{ +Shuffle a vector +} diff --git a/man/SplitDotPlotGG.Rd b/man/SplitDotPlotGG.Rd new file mode 100644 index 000000000..a9d4d08f0 --- /dev/null +++ b/man/SplitDotPlotGG.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{SplitDotPlotGG} +\alias{SplitDotPlotGG} +\title{Split Dot plot visualization} +\usage{ +SplitDotPlotGG(object, grouping.var, genes.plot, gene.groups, + cols.use = c("green", "red"), col.min = -2.5, col.max = 2.5, + dot.min = 0, dot.scale = 6, group.by, plot.legend = FALSE, + do.return = FALSE, x.lab.rot = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{grouping.var}{Grouping variable for splitting the dataset} + +\item{genes.plot}{Input vector of genes} + +\item{gene.groups}{Add 
labeling bars to the top of the plot} + +\item{cols.use}{colors to plot} + +\item{col.min}{Minimum scaled average expression threshold (everything smaller will be set to this)} + +\item{col.max}{Maximum scaled average expression threshold (everything larger will be set to this)} + +\item{dot.min}{The fraction of cells at which to draw the smallest dot (default is 0.05).} + +\item{dot.scale}{Scale the size of the points, similar to cex} + +\item{group.by}{Factor to group the cells by} + +\item{plot.legend}{plots the legends} + +\item{do.return}{Return ggplot2 object} + +\item{x.lab.rot}{Rotate x-axis labels} +} +\value{ +default, no return, only graphical output. If do.return=TRUE, returns a ggplot2 object +} +\description{ +Intuitive way of visualizing how gene expression changes across different identity classes (clusters). +The size of the dot encodes the percentage of cells within a class, while the color encodes the +AverageExpression level of 'expressing' cells (green is high). Splits the cells into two groups based on a +grouping variable. +Still in BETA +} diff --git a/man/StashIdent.Rd b/man/StashIdent.Rd new file mode 100644 index 000000000..445d0099d --- /dev/null +++ b/man/StashIdent.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{StashIdent} +\alias{StashIdent} +\title{Set identity class information} +\usage{ +StashIdent(object, save.name = "oldIdent") +} +\arguments{ +\item{object}{Seurat object} + +\item{save.name}{Store current object@ident under this column name in object@meta.data. Can be easily retrived with SetAllIdent} +} +\value{ +A Seurat object where object@ident has been appropriately modified +} +\description{ +Stashes the identity in data.info to be retrieved later. 
Useful if, for example, testing multiple clustering parameters +} diff --git a/man/SubsetColumn.Rd b/man/SubsetColumn.Rd new file mode 100644 index 000000000..e8c0b7ed8 --- /dev/null +++ b/man/SubsetColumn.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{SubsetColumn} +\alias{SubsetColumn} +\title{Return a subset of columns for a matrix or data frame} +\usage{ +SubsetColumn(data, code, invert = FALSE) +} +\arguments{ +\item{data}{Matrix or data frame with column names} + +\item{code}{Pattern for matching within column names} + +\item{invert}{Invert the search?} +} +\value{ +Returns a subset of data. If invert = TRUE, returns data where colnames +do not contain code, otherwise returns data where colnames contain code +} +\description{ +Return a subset of columns for a matrix or data frame +} diff --git a/man/SubsetData.Rd b/man/SubsetData.Rd new file mode 100644 index 000000000..aa1457f62 --- /dev/null +++ b/man/SubsetData.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{SubsetData} +\alias{SubsetData} +\title{Return a subset of the Seurat object} +\usage{ +SubsetData(object, cells.use = NULL, subset.name = NULL, ident.use = NULL, + ident.remove = NULL, accept.low = -Inf, accept.high = Inf, + do.center = FALSE, do.scale = FALSE, max.cells.per.ident = Inf, + random.seed = 1, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.use}{A vector of cell names to use as a subset. If NULL +(default), then this list will be computed based on the next three +arguments. Otherwise, will return an object consisting only of these cells} + +\item{subset.name}{Parameter to subset on. Eg, the name of a gene, PC1, a +column name in object@meta.data, etc. 
Any argument that can be retrieved +using FetchData} + +\item{ident.use}{Create a cell subset based on the provided identity classes} + +\item{ident.remove}{Subtract out cells from these identity classes (used for filtration)} + +\item{accept.low}{Low cutoff for the parameter (default is -Inf)} + +\item{accept.high}{High cutoff for the parameter (default is Inf)} + +\item{do.center}{Recenter the new object@scale.data} + +\item{do.scale}{Rescale the new object@scale.data. FALSE by default} + +\item{max.cells.per.ident}{Can be used to downsample the data to a certain max per cell ident. Default is inf.} + +\item{random.seed}{Random seed for downsampling} + +\item{\dots}{Additional arguments to be passed to FetchData (for example, +use.imputed=TRUE)} +} +\value{ +Returns a Seurat object containing only the relevant subset of cells +} +\description{ +Creates a Seurat object containing only a subset of the cells in the +original object. Takes either a list of cells to use as a subset, or a +parameter (for example, a gene), to subset on. +} diff --git a/man/SubsetRow.Rd b/man/SubsetRow.Rd new file mode 100644 index 000000000..50b2f845c --- /dev/null +++ b/man/SubsetRow.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{SubsetRow} +\alias{SubsetRow} +\title{Return a subset of rows for a matrix or data frame} +\usage{ +SubsetRow(data, code, invert = FALSE) +} +\arguments{ +\item{data}{Matrix or data frame with row names} + +\item{code}{Pattern for matching within row names} + +\item{invert}{Invert the search?} +} +\value{ +Returns a subset of data. 
If invert = TRUE, returns data where rownames +do not contain code, otherwise returns data where rownames contain code +} +\description{ +Return a subset of rows for a matrix or data frame +} diff --git a/man/TSNEPlot.Rd b/man/TSNEPlot.Rd new file mode 100644 index 000000000..4002ad5ac --- /dev/null +++ b/man/TSNEPlot.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{TSNEPlot} +\alias{TSNEPlot} +\title{Plot tSNE map} +\usage{ +TSNEPlot(object, do.label = FALSE, pt.size = 1, label.size = 4, + cells.use = NULL, colors.use = NULL, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{do.label}{FALSE by default. If TRUE, plots an alternate view where the center of each +cluster is labeled} + +\item{pt.size}{Set the point size} + +\item{label.size}{Set the size of the text labels} + +\item{cells.use}{Vector of cell names to use in the plot.} + +\item{colors.use}{Manually set the color palette to use for the points} + +\item{\dots}{Additional parameters to DimPlot, for example, which dimensions to plot.} +} +\description{ +Graphs the output of a tSNE analysis +Cells are colored by their identity class. +} +\details{ +This function is a wrapper for DimPlot. See ?DimPlot for a full list of possible +arguments which can be passed in here. +} +\seealso{ +DimPlot +} diff --git a/man/TobitTest.Rd b/man/TobitTest.Rd new file mode 100644 index 000000000..ec59bf74d --- /dev/null +++ b/man/TobitTest.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{TobitTest} +\alias{TobitTest} +\title{Differential expression testing using Tobit models} +\usage{ +TobitTest(object, cells.1, cells.2, genes.use = NULL, print.bar = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{cells.1}{Group 1 cells} + +\item{cells.2}{Group 2 cells} + +\item{genes.use}{Genes to test. 
Default is to use all genes} + +\item{print.bar}{Print a progress bar once expression testing begins (uses pbapply to do this)} +} +\value{ +Returns a p-value ranked matrix of putative differentially expressed +genes. +} +\description{ +Identifies differentially expressed genes between two groups of cells using +Tobit models, as proposed in Trapnell et al., Nature Biotechnology, 2014 +} diff --git a/man/UpdateSeuratObject.Rd b/man/UpdateSeuratObject.Rd new file mode 100644 index 000000000..1c5c54391 --- /dev/null +++ b/man/UpdateSeuratObject.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{UpdateSeuratObject} +\alias{UpdateSeuratObject} +\title{Update old Seurat object to accommodate new features} +\usage{ +UpdateSeuratObject(object) +} +\arguments{ +\item{object}{Seurat object} +} +\value{ +Returns a Seurat object compatible with latest changes +} +\description{ +Updates Seurat objects to new structure for storing data/calculations. 
+} diff --git a/man/ValidateClusters.Rd b/man/ValidateClusters.Rd new file mode 100644 index 000000000..172a3bc90 --- /dev/null +++ b/man/ValidateClusters.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_validation.R +\name{ValidateClusters} +\alias{ValidateClusters} +\title{Cluster Validation} +\usage{ +ValidateClusters(object, pc.use = NULL, top.genes = 30, + min.connectivity = 0.01, acc.cutoff = 0.9, verbose = TRUE) +} +\arguments{ +\item{object}{Seurat object} + +\item{pc.use}{Which PCs to use to define genes in model construction} + +\item{top.genes}{Use the top X genes for each PC in model construction} + +\item{min.connectivity}{Threshold of connectedness for comparison of two +clusters} + +\item{acc.cutoff}{Accuracy cutoff for classifier} + +\item{verbose}{Controls whether to display progress and merging results} +} +\value{ +Returns a Seurat object, object@ident has been updated with new +cluster info +} +\description{ +Methods for validating the legitimacy of clusters using classification. SVMs +are used as the basis for the classification. Merging is done based on the +connectivity from an SNN graph. 
+} diff --git a/man/ValidateSpecificClusters.Rd b/man/ValidateSpecificClusters.Rd new file mode 100644 index 000000000..7b6f5b523 --- /dev/null +++ b/man/ValidateSpecificClusters.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cluster_validation.R +\name{ValidateSpecificClusters} +\alias{ValidateSpecificClusters} +\title{Specific Cluster Validation} +\usage{ +ValidateSpecificClusters(object, cluster1 = NULL, cluster2 = 1, + pc.use = 2, top.genes = 30, acc.cutoff = 0.9) +} +\arguments{ +\item{object}{Seurat object} + +\item{cluster1}{First cluster to check classification} + +\item{cluster2}{Second cluster to check with classification} + +\item{pc.use}{Which PCs to use for model construction} + +\item{top.genes}{Use the top X genes for model construction} + +\item{acc.cutoff}{Accuracy cutoff for classifier} +} +\value{ +Returns a Seurat object, object@ident has been updated with +new cluster info +} +\description{ +Methods for validating the legitimacy of two specific clusters using +classification. SVMs are used as the basis for the classification. +Merging is done based on the connectivity from an SNN graph. 
+} diff --git a/man/VariableGenePlot.Rd b/man/VariableGenePlot.Rd new file mode 100644 index 000000000..3f2234056 --- /dev/null +++ b/man/VariableGenePlot.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{VariableGenePlot} +\alias{VariableGenePlot} +\title{View variable genes} +\usage{ +VariableGenePlot(object, do.text = TRUE, cex.use = 0.5, + cex.text.use = 0.5, do.spike = FALSE, pch.use = 16, col.use = "black", + spike.col.use = "red", plot.both = FALSE, do.contour = TRUE, + contour.lwd = 3, contour.col = "white", contour.lty = 2, + x.low.cutoff = 0.1, x.high.cutoff = 8, y.cutoff = 1, + y.high.cutoff = Inf) +} +\arguments{ +\item{object}{Seurat object} + +\item{do.text}{Add text names of variable genes to plot (default is TRUE)} + +\item{cex.use}{Point size} + +\item{cex.text.use}{Text size} + +\item{do.spike}{FALSE by default. If TRUE, color all genes starting with ^ERCC a different color} + +\item{pch.use}{Pch value for points} + +\item{col.use}{Color to use} + +\item{spike.col.use}{if do.spike, color for spike-in genes} + +\item{plot.both}{Plot both the scaled and non-scaled graphs.} + +\item{do.contour}{Draw contour lines calculated based on all genes} + +\item{contour.lwd}{Contour line width} + +\item{contour.col}{Contour line color} + +\item{contour.lty}{Contour line type} + +\item{x.low.cutoff}{Bottom cutoff on x-axis for identifying variable genes} + +\item{x.high.cutoff}{Top cutoff on x-axis for identifying variable genes} + +\item{y.cutoff}{Bottom cutoff on y-axis for identifying variable genes} + +\item{y.high.cutoff}{Top cutoff on y-axis for identifying variable genes} +} +\description{ +View variable genes +} diff --git a/man/VizClassification.Rd b/man/VizClassification.Rd new file mode 100644 index 000000000..b40dd588f --- /dev/null +++ b/man/VizClassification.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R 
+\name{VizClassification} +\alias{VizClassification} +\title{Highlight classification results} +\usage{ +VizClassification(object, clusters, ...) +} +\arguments{ +\item{object}{Seurat object on which the classifier was trained and +onto which the classification results will be highlighted} + +\item{clusters}{vector of cluster ids (output of ClassifyCells)} + +\item{...}{additional parameters to pass to FeaturePlot()} +} +\value{ +Returns a feature plot with clusters highlighted by proportion of cells +mapping to that cluster +} +\description{ +This function is useful to view where proportionally the clusters returned from +classification map to the clusters present in the given object. Utilizes the FeaturePlot() +function to color clusters in object. +} diff --git a/man/VizDimReduction.Rd b/man/VizDimReduction.Rd new file mode 100644 index 000000000..f291cb6ec --- /dev/null +++ b/man/VizDimReduction.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{VizDimReduction} +\alias{VizDimReduction} +\title{Visualize Dimensional Reduction genes} +\usage{ +VizDimReduction(object, reduction.type = "pca", dims.use = 1:5, + num.genes = 30, use.full = FALSE, font.size = 0.5, nCol = NULL, + do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{reduction.type}{Reduction technique to visualize results for} + +\item{dims.use}{Number of dimensions to display} + +\item{num.genes}{Number of genes to display} + +\item{use.full}{Use reduction values for full dataset (i.e. projected dimensional reduction values)} + +\item{font.size}{Font size} + +\item{nCol}{Number of columns to display} + +\item{do.balanced}{Return an equal number of genes with + and - scores. 
If FALSE (default), returns +the top genes ranked by the scores absolute values} +} +\value{ +Graphical, no return value +} +\description{ +Visualize top genes associated with reduction components +} diff --git a/man/VizICA.Rd b/man/VizICA.Rd new file mode 100644 index 000000000..e81905720 --- /dev/null +++ b/man/VizICA.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{VizICA} +\alias{VizICA} +\title{Visualize ICA genes} +\usage{ +VizICA(object, ics.use = 1:5, num.genes = 30, use.full = FALSE, + font.size = 0.5, nCol = NULL, do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{ics.use}{Number of ICs to display} + +\item{num.genes}{Number of genes to display} + +\item{use.full}{Use full ICA (i.e. the projected ICA, by default FALSE)} + +\item{font.size}{Font size} + +\item{nCol}{Number of columns to display} + +\item{do.balanced}{Return an equal number of genes with both + and - IC scores. +If FALSE (by default), returns the top genes ranked by the score's absolute values} +} +\value{ +Graphical, no return value +} +\description{ +Visualize top genes associated with principal components +} diff --git a/man/VizPCA.Rd b/man/VizPCA.Rd new file mode 100644 index 000000000..915a33719 --- /dev/null +++ b/man/VizPCA.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{VizPCA} +\alias{VizPCA} +\title{Visualize PCA genes} +\usage{ +VizPCA(object, pcs.use = 1:5, num.genes = 30, use.full = FALSE, + font.size = 0.5, nCol = NULL, do.balanced = FALSE) +} +\arguments{ +\item{object}{Seurat object} + +\item{pcs.use}{Number of PCs to display} + +\item{num.genes}{Number of genes to display} + +\item{use.full}{Use full PCA (i.e. the projected PCA, by default FALSE)} + +\item{font.size}{Font size} + +\item{nCol}{Number of columns to display} + +\item{do.balanced}{Return an equal number of genes with both + and - PC scores. 
+If FALSE (by default), returns the top genes ranked by the score's absolute values} +} +\value{ +Graphical, no return value +} +\description{ +Visualize top genes associated with principal components +} diff --git a/man/VlnPlot.Rd b/man/VlnPlot.Rd new file mode 100644 index 000000000..a0e4f4eb8 --- /dev/null +++ b/man/VlnPlot.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{VlnPlot} +\alias{VlnPlot} +\title{Single cell violin plot} +\usage{ +VlnPlot(object, features.plot, ident.include = NULL, nCol = NULL, + do.sort = FALSE, y.max = NULL, same.y.lims = FALSE, size.x.use = 16, + size.y.use = 16, size.title.use = 20, adjust.use = 1, + point.size.use = 1, cols.use = NULL, group.by = NULL, y.log = FALSE, + x.lab.rot = FALSE, y.lab.rot = FALSE, legend.position = "right", + single.legend = TRUE, remove.legend = FALSE, do.return = FALSE, + return.plotlist = FALSE, ...) +} +\arguments{ +\item{object}{Seurat object} + +\item{features.plot}{Features to plot (gene expression, metrics, PC scores, +anything that can be retreived by FetchData)} + +\item{ident.include}{Which classes to include in the plot (default is all)} + +\item{nCol}{Number of columns if multiple plots are displayed} + +\item{do.sort}{Sort identity classes (on the x-axis) by the average +expression of the attribute being potted} + +\item{y.max}{Maximum y axis value} + +\item{same.y.lims}{Set all the y-axis limits to the same values} + +\item{size.x.use}{X axis title font size} + +\item{size.y.use}{Y axis title font size} + +\item{size.title.use}{Main title font size} + +\item{adjust.use}{Adjust parameter for geom_violin} + +\item{point.size.use}{Point size for geom_violin} + +\item{cols.use}{Colors to use for plotting} + +\item{group.by}{Group (color) cells in different ways (for example, orig.ident)} + +\item{y.log}{plot Y axis on log scale} + +\item{x.lab.rot}{Rotate x-axis labels} + +\item{y.lab.rot}{Rotate y-axis labels} + 
+\item{legend.position}{Position the legend for the plot} + +\item{single.legend}{Consolidate the legend for all plots} + +\item{remove.legend}{Remove the legend from the plot} + +\item{do.return}{Return a ggplot2 object (default : FALSE)} + +\item{return.plotlist}{Return the list of individual plots instead of compiled plot.} + +\item{\dots}{additional parameters to pass to FetchData (for example, use.imputed, use.scaled, use.raw)} +} +\value{ +By default, no return, only graphical output. If do.return=TRUE, +returns a list of ggplot objects. +} +\description{ +Draws a violin plot of single cell data (gene expression, metrics, PC +scores, etc.) +} diff --git a/man/WhichCells.Rd b/man/WhichCells.Rd new file mode 100644 index 000000000..c090fffe5 --- /dev/null +++ b/man/WhichCells.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interaction.R +\name{WhichCells} +\alias{WhichCells} +\title{Identify cells matching certain criteria} +\usage{ +WhichCells(object, ident = NULL, ident.remove = NULL, cells.use = NULL, + subset.name = NULL, accept.low = -Inf, accept.high = Inf, + accept.value = NULL, max.cells.per.ident = Inf, random.seed = 1) +} +\arguments{ +\item{object}{Seurat object} + +\item{ident}{Identity classes to subset. Default is all identities.} + +\item{ident.remove}{Identity classes to remove. Default is NULL.} + +\item{cells.use}{Subset of cell names} + +\item{subset.name}{Parameter to subset on. Eg, the name of a gene, PC1, a +column name in object@meta.data, etc. Any argument that can be retrieved +using FetchData} + +\item{accept.low}{Low cutoff for the parameter (default is -Inf)} + +\item{accept.high}{High cutoff for the parameter (default is Inf)} + +\item{accept.value}{Returns all cells with the subset name equal to this value} + +\item{max.cells.per.ident}{Can be used to downsample the data to a certain max per cell ident. 
Default is inf.} + +\item{random.seed}{Random seed for downsampling} +} +\value{ +A vector of cell names +} +\description{ +Returns a list of cells that match a particular set of criteria such as +identity class, high/low values for particular PCs, etc. +} diff --git a/man/seurat.Rd b/man/seurat.Rd new file mode 100644 index 000000000..201afffee --- /dev/null +++ b/man/seurat.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/seurat.R +\docType{class} +\name{seurat} +\alias{seurat} +\alias{seurat-class} +\title{The Seurat Class} +\description{ +The Seurat object is the center of each single cell analysis. It stores all information +associated with the dataset, including data, annotations, analyses, etc. All that is needed +to construct a Seurat object is an expression matrix (rows are genes, columns are cells), which +should be log-scale +} +\details{ +Each Seurat object has a number of slots which store information. Key slots to access +are listed below. +} +\section{Slots}{ + + \describe{ + \item{\code{raw.data}:}{\code{"ANY"}, The raw project data } + \item{\code{data}:}{\code{"ANY"}, The expression matrix (log-scale) } + \item{\code{scale.data}:}{\code{"ANY"}, The scaled (after z-scoring + each gene) expression matrix. Used for PCA, ICA, and heatmap plotting} + \item{\code{var.genes}:}{\code{"vector"}, Variable genes across single cells } + \item{\code{is.expr}:}{\code{"numeric"}, Expression threshold to determine if a gene is expressed } + \item{\code{ident}:}{\code{"factor"}, The 'identity class' for each single cell } + \item{\code{meta.data}:}{\code{"data.frame"}, Contains metadata information about each cell, starting with # of genes detected (nGene) + the original identity class (orig.ident), user-provided information (through AddMetaData), etc. 
} + \item{\code{project.name}:}{\code{"character"}, Name of the project (for record keeping) } + \item{\code{dr}:}{\code{"list"}, List of stored dimensional reductions. Named by technique } + \item{\code{assay}:}{\code{"list"}, List of additional assays for multimodal analysis. Named by technique } + \item{\code{hvg.info}:}{\code{"data.frame"}, The output of the mean/variability analysis for all genes } + \item{\code{imputed}:}{\code{"data.frame"}, Matrix of imputed gene scores } + \item{\code{cell.names}:}{\code{"vector"}, Names of all single cells (column names of the expression matrix) } + \item{\code{cluster.tree}:}{\code{"list"}, List where the first element is a phylo object containing the + phylogenetic tree relating different identity classes } + \item{\code{snn}:}{\code{"dgCMatrix"}, Sparse matrix object representation of the SNN graph } + \item{\code{calc.params}:}{\code{"list"}, Named list to store all calculation related parameters choices} + \item{\code{kmeans}:}{\code{"ANY"}, Stores output of gene-based clustering from DoKMeans} + \item{\code{spatial}:}{\code{"ANY"},Stores internal data and calculations for spatial mapping of single cells} + \item{\code{misc}:}{\code{"ANY"}, Miscellaneous spot to store any data alongisde the object (for example, gene lists)} + \item{\code{version}:}{\code{"ANY"}, Version of package used in object creation} +} +} + diff --git a/man/situ3d.Rd b/man/situ3d.Rd new file mode 100644 index 000000000..76c817237 --- /dev/null +++ b/man/situ3d.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/zfRenderSeurat.R +\name{situ3d} +\alias{situ3d} +\title{Draw 3D in situ predictions from Zebrafish dataset} +\usage{ +situ3d(data, label = NULL, ...) 
+} +\arguments{ +\item{data}{Predicted expression levels across Zebrafish bins} + +\item{label}{Plot label} +} +\description{ +From Jeff Farrell +} diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 000000000..214f8c0e2 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,4 @@ +PKG_CXXFLAGS = -I../inst/include +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +CXX_STD = CXX11 +PKG_CXXFLAGS = -g -std=c++11 -Wall -pedantic diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 000000000..df652c593 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,175 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include +#include + +using namespace Rcpp; + +// RunUMISampling +Eigen::SparseMatrix RunUMISampling(Eigen::SparseMatrix data, int sample_val, bool upsample, bool display_progress); +RcppExport SEXP _Seurat_RunUMISampling(SEXP dataSEXP, SEXP sample_valSEXP, SEXP upsampleSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type data(dataSEXP); + Rcpp::traits::input_parameter< int >::type sample_val(sample_valSEXP); + Rcpp::traits::input_parameter< bool >::type upsample(upsampleSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + rcpp_result_gen = Rcpp::wrap(RunUMISampling(data, sample_val, upsample, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// RunUMISamplingPerCell +Eigen::SparseMatrix RunUMISamplingPerCell(Eigen::SparseMatrix data, NumericVector sample_val, bool upsample, bool display_progress); +RcppExport SEXP _Seurat_RunUMISamplingPerCell(SEXP dataSEXP, SEXP sample_valSEXP, SEXP upsampleSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type 
data(dataSEXP); + Rcpp::traits::input_parameter< NumericVector >::type sample_val(sample_valSEXP); + Rcpp::traits::input_parameter< bool >::type upsample(upsampleSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + rcpp_result_gen = Rcpp::wrap(RunUMISamplingPerCell(data, sample_val, upsample, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// RowMergeMatrices +Eigen::SparseMatrix RowMergeMatrices(Eigen::SparseMatrix mat1, Eigen::SparseMatrix mat2, std::vector< std::string > mat1_rownames, std::vector< std::string > mat2_rownames, std::vector< std::string > all_rownames); +RcppExport SEXP _Seurat_RowMergeMatrices(SEXP mat1SEXP, SEXP mat2SEXP, SEXP mat1_rownamesSEXP, SEXP mat2_rownamesSEXP, SEXP all_rownamesSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type mat1(mat1SEXP); + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type mat2(mat2SEXP); + Rcpp::traits::input_parameter< std::vector< std::string > >::type mat1_rownames(mat1_rownamesSEXP); + Rcpp::traits::input_parameter< std::vector< std::string > >::type mat2_rownames(mat2_rownamesSEXP); + Rcpp::traits::input_parameter< std::vector< std::string > >::type all_rownames(all_rownamesSEXP); + rcpp_result_gen = Rcpp::wrap(RowMergeMatrices(mat1, mat2, mat1_rownames, mat2_rownames, all_rownames)); + return rcpp_result_gen; +END_RCPP +} +// LogNorm +Eigen::SparseMatrix LogNorm(Eigen::SparseMatrix data, int scale_factor, bool display_progress); +RcppExport SEXP _Seurat_LogNorm(SEXP dataSEXP, SEXP scale_factorSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type data(dataSEXP); + Rcpp::traits::input_parameter< int >::type scale_factor(scale_factorSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + 
rcpp_result_gen = Rcpp::wrap(LogNorm(data, scale_factor, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// FastMatMult +Eigen::MatrixXd FastMatMult(Eigen::MatrixXd m1, Eigen::MatrixXd m2); +RcppExport SEXP _Seurat_FastMatMult(SEXP m1SEXP, SEXP m2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type m1(m1SEXP); + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type m2(m2SEXP); + rcpp_result_gen = Rcpp::wrap(FastMatMult(m1, m2)); + return rcpp_result_gen; +END_RCPP +} +// FastRowScale +Eigen::MatrixXd FastRowScale(Eigen::MatrixXd mat, bool scale, bool center, double scale_max, bool display_progress); +RcppExport SEXP _Seurat_FastRowScale(SEXP matSEXP, SEXP scaleSEXP, SEXP centerSEXP, SEXP scale_maxSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat(matSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< bool >::type center(centerSEXP); + Rcpp::traits::input_parameter< double >::type scale_max(scale_maxSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + rcpp_result_gen = Rcpp::wrap(FastRowScale(mat, scale, center, scale_max, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// Standardize +Eigen::MatrixXd Standardize(Eigen::MatrixXd mat, bool display_progress); +RcppExport SEXP _Seurat_Standardize(SEXP matSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat(matSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + rcpp_result_gen = Rcpp::wrap(Standardize(mat, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// FastSparseRowScale +Eigen::MatrixXd 
FastSparseRowScale(Eigen::SparseMatrix mat, bool scale, bool center, double scale_max, bool display_progress); +RcppExport SEXP _Seurat_FastSparseRowScale(SEXP matSEXP, SEXP scaleSEXP, SEXP centerSEXP, SEXP scale_maxSEXP, SEXP display_progressSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::SparseMatrix >::type mat(matSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< bool >::type center(centerSEXP); + Rcpp::traits::input_parameter< double >::type scale_max(scale_maxSEXP); + Rcpp::traits::input_parameter< bool >::type display_progress(display_progressSEXP); + rcpp_result_gen = Rcpp::wrap(FastSparseRowScale(mat, scale, center, scale_max, display_progress)); + return rcpp_result_gen; +END_RCPP +} +// FastCov +Eigen::MatrixXd FastCov(Eigen::MatrixXd mat, bool center); +RcppExport SEXP _Seurat_FastCov(SEXP matSEXP, SEXP centerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat(matSEXP); + Rcpp::traits::input_parameter< bool >::type center(centerSEXP); + rcpp_result_gen = Rcpp::wrap(FastCov(mat, center)); + return rcpp_result_gen; +END_RCPP +} +// FastCovMats +Eigen::MatrixXd FastCovMats(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2, bool center); +RcppExport SEXP _Seurat_FastCovMats(SEXP mat1SEXP, SEXP mat2SEXP, SEXP centerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat1(mat1SEXP); + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat2(mat2SEXP); + Rcpp::traits::input_parameter< bool >::type center(centerSEXP); + rcpp_result_gen = Rcpp::wrap(FastCovMats(mat1, mat2, center)); + return rcpp_result_gen; +END_RCPP +} +// FastRBind +Eigen::MatrixXd FastRBind(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2); +RcppExport SEXP _Seurat_FastRBind(SEXP 
mat1SEXP, SEXP mat2SEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat1(mat1SEXP); + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type mat2(mat2SEXP); + rcpp_result_gen = Rcpp::wrap(FastRBind(mat1, mat2)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_Seurat_RunUMISampling", (DL_FUNC) &_Seurat_RunUMISampling, 4}, + {"_Seurat_RunUMISamplingPerCell", (DL_FUNC) &_Seurat_RunUMISamplingPerCell, 4}, + {"_Seurat_RowMergeMatrices", (DL_FUNC) &_Seurat_RowMergeMatrices, 5}, + {"_Seurat_LogNorm", (DL_FUNC) &_Seurat_LogNorm, 3}, + {"_Seurat_FastMatMult", (DL_FUNC) &_Seurat_FastMatMult, 2}, + {"_Seurat_FastRowScale", (DL_FUNC) &_Seurat_FastRowScale, 5}, + {"_Seurat_Standardize", (DL_FUNC) &_Seurat_Standardize, 2}, + {"_Seurat_FastSparseRowScale", (DL_FUNC) &_Seurat_FastSparseRowScale, 5}, + {"_Seurat_FastCov", (DL_FUNC) &_Seurat_FastCov, 2}, + {"_Seurat_FastCovMats", (DL_FUNC) &_Seurat_FastCovMats, 3}, + {"_Seurat_FastRBind", (DL_FUNC) &_Seurat_FastRBind, 2}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_Seurat(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/data_manipulation.cpp b/src/data_manipulation.cpp new file mode 100644 index 000000000..88636db4f --- /dev/null +++ b/src/data_manipulation.cpp @@ -0,0 +1,259 @@ +#include +#include +#include +#include + +using namespace Rcpp; +// [[Rcpp::depends(RcppEigen)]] +// [[Rcpp::depends(RcppProgress)]] + + + +// [[Rcpp::export]] +Eigen::SparseMatrix RunUMISampling(Eigen::SparseMatrix data, int sample_val, bool upsample = false, bool display_progress=true){ + Progress p(data.outerSize(), display_progress); + Eigen::VectorXd colSums = data.transpose() * Eigen::VectorXd::Ones(data.cols()); + for (int k=0; k < data.outerSize(); ++k){ + p.increment(); + for (Eigen::SparseMatrix::InnerIterator it(data, k); 
it; ++it){ + double entry = it.value(); + if( (upsample) || (colSums[k] > sample_val)){ + entry = entry * double(sample_val) / colSums[k]; + if (fmod(entry, 1) != 0){ + double rn = runif(1)[0]; + if(fmod(entry, 1) <= rn){ + data.coeffRef(it.row(), it.col()) = floor(entry); + } + else{ + data.coeffRef(it.row(), it.col()) = ceil(entry); + } + } + else{ + data.coeffRef(it.row(), it.col()) = entry; + } + } + } + } + return(data); +} + +// [[Rcpp::export]] +Eigen::SparseMatrix RunUMISamplingPerCell(Eigen::SparseMatrix data, NumericVector sample_val, bool upsample = false, bool display_progress=true){ + Progress p(data.outerSize(), display_progress); + Eigen::VectorXd colSums = data.transpose() * Eigen::VectorXd::Ones(data.cols()); + for (int k=0; k < data.outerSize(); ++k){ + p.increment(); + for (Eigen::SparseMatrix::InnerIterator it(data, k); it; ++it){ + double entry = it.value(); + if( (upsample) || (colSums[k] > sample_val[k])){ + entry = entry * double(sample_val[k]) / colSums[k]; + if (fmod(entry, 1) != 0){ + double rn = runif(1)[0]; + if(fmod(entry, 1) <= rn){ + data.coeffRef(it.row(), it.col()) = floor(entry); + } + else{ + data.coeffRef(it.row(), it.col()) = ceil(entry); + } + } + else{ + data.coeffRef(it.row(), it.col()) = entry; + } + } + } + } + return(data); +} + + +typedef Eigen::Triplet T; +// [[Rcpp::export]] +Eigen::SparseMatrix RowMergeMatrices(Eigen::SparseMatrix mat1, Eigen::SparseMatrix mat2, std::vector< std::string > mat1_rownames, + std::vector< std::string > mat2_rownames, std::vector< std::string > all_rownames){ + + + // Set up hash maps for rowname based lookup + std::unordered_map mat1_map; + for(int i = 0; i < mat1_rownames.size(); i++){ + mat1_map[mat1_rownames[i]] = i; + } + std::unordered_map mat2_map; + for(int i = 0; i < mat2_rownames.size(); i++){ + mat2_map[mat2_rownames[i]] = i; + } + + // set up tripletList for new matrix creation + std::vector tripletList; + int num_rows = all_rownames.size(); + int num_col1 = mat1.cols(); + int 
num_col2 = mat2.cols(); + + + tripletList.reserve(mat1.nonZeros() + mat2.nonZeros()); + for(int i = 0; i < num_rows; i++){ + std::string key = all_rownames[i]; + if (mat1_map.count(key)){ + for(Eigen::SparseMatrix::InnerIterator it1(mat1, mat1_map[key]); it1; ++it1){ + tripletList.push_back(T(i, it1.col(), it1.value())); + } + } + if (mat2_map.count(key)){ + for(Eigen::SparseMatrix::InnerIterator it2(mat2, mat2_map[key]); it2; ++it2){ + tripletList.push_back(T(i, num_col1 + it2.col(), it2.value())); + } + } + } + Eigen::SparseMatrix combined_mat(num_rows, num_col1 + num_col2); + combined_mat.setFromTriplets(tripletList.begin(), tripletList.end()); + return combined_mat; +} + +// [[Rcpp::export]] +Eigen::SparseMatrix LogNorm(Eigen::SparseMatrix data, int scale_factor, bool display_progress = true){ + Progress p(data.outerSize(), display_progress); + Eigen::VectorXd colSums = data.transpose() * Eigen::VectorXd::Ones(data.cols()); + for (int k=0; k < data.outerSize(); ++k){ + p.increment(); + for (Eigen::SparseMatrix::InnerIterator it(data, k); it; ++it){ + data.coeffRef(it.row(), it.col()) = log1p(double(it.value()) / colSums[k] * scale_factor); + } + } + return data; +} + +// [[Rcpp::export]] +Eigen::MatrixXd FastMatMult(Eigen::MatrixXd m1, Eigen::MatrixXd m2){ + Eigen::MatrixXd m3 = m1 * m2; + return(m3); +} + + +/* Performs row scaling and/or centering. Equivalent to using t(scale(t(mat))) in R. 
+ Note: Doesn't handle NA/NaNs in the same way the R implementation does, */ + +// [[Rcpp::export]] +Eigen::MatrixXd FastRowScale(Eigen::MatrixXd mat, bool scale = true, bool center = true, + double scale_max = 10, bool display_progress = true){ + Progress p(mat.rows(), display_progress); + Eigen::MatrixXd scaled_mat(mat.rows(), mat.cols()); + for(int i=0; i < mat.rows(); ++i){ + p.increment(); + Eigen::ArrayXd r = mat.row(i).array(); + double rowMean = r.mean(); + double rowSdev = 1; + if(scale == true){ + if(center == true){ + rowSdev = sqrt((r - rowMean).square().sum() / (mat.cols() - 1)); + } + else{ + rowSdev = sqrt(r.square().sum() / (mat.cols() - 1)); + } + } + if(center == false){ + rowMean = 0; + } + scaled_mat.row(i) = (r - rowMean) / rowSdev; + for(int s=0; s scale_max){ + scaled_mat(i, s) = scale_max; + } + } + } + return scaled_mat; +} + +/* Performs column scaling and/or centering. Equivalent to using scale(mat, TRUE, apply(x,2,sd)) in R. + Note: Doesn't handle NA/NaNs in the same way the R implementation does, */ + +// [[Rcpp::export]] +Eigen::MatrixXd Standardize(Eigen::MatrixXd mat, bool display_progress = true){ + Progress p(mat.cols(), display_progress); + Eigen::MatrixXd std_mat(mat.rows(), mat.cols()); + for(int i=0; i < mat.cols(); ++i){ + p.increment(); + Eigen::ArrayXd r = mat.col(i).array(); + double colMean = r.mean(); + double colSdev = sqrt((r - colMean).square().sum() / (mat.rows() - 1)); + std_mat.col(i) = (r - colMean) / colSdev; + } + return std_mat; +} + +// [[Rcpp::export]] +Eigen::MatrixXd FastSparseRowScale(Eigen::SparseMatrix mat, bool scale = true, bool center = true, + double scale_max = 10, bool display_progress = true){ + mat = mat.transpose(); + Progress p(mat.outerSize(), display_progress); + Eigen::MatrixXd scaled_mat(mat.rows(), mat.cols()); + for (int k=0; k::InnerIterator it(mat,k); it; ++it) + { + colMean += it.value(); + } + colMean = colMean / mat.rows(); + if (scale == true){ + int nnZero = 0; + if(center == true){ 
+ for (Eigen::SparseMatrix::InnerIterator it(mat,k); it; ++it) + { + nnZero += 1; + colSdev += pow((it.value() - colMean), 2); + } + colSdev += pow(colMean, 2) * (mat.rows() - nnZero); + } + else{ + for (Eigen::SparseMatrix::InnerIterator it(mat,k); it; ++it) + { + colSdev += pow(it.value(), 2); + } + } + colSdev = sqrt(colSdev / (mat.rows() - 1)); + } + else{ + colSdev = 1; + } + if(center == false){ + colMean = 0; + } + Eigen::VectorXd col = Eigen::VectorXd(mat.col(k)); + scaled_mat.col(k) = (col.array() - colMean) / colSdev; + for(int s=0; s scale_max){ + scaled_mat(s,k) = scale_max; + } + } + } + return scaled_mat.transpose(); +} + +/* Note: May not handle NA/NaNs in the same way the R implementation does, */ + +// [[Rcpp::export]] +Eigen::MatrixXd FastCov(Eigen::MatrixXd mat, bool center = true){ + if (center) { + mat = mat.rowwise() - mat.colwise().mean(); + } + Eigen::MatrixXd cov = (mat.adjoint() * mat) / double(mat.rows() - 1); + return(cov); +} + +// [[Rcpp::export]] +Eigen::MatrixXd FastCovMats(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2, bool center = true){ + if(center){ + mat1 = mat1.rowwise() - mat1.colwise().mean(); + mat2 = mat2.rowwise() - mat2.colwise().mean(); + } + Eigen::MatrixXd cov = (mat1.adjoint() * mat2) / double(mat1.rows() - 1); + return(cov); +} + +/* Note: Faster than the R implementation but is not in-place */ +//[[Rcpp::export]] +Eigen::MatrixXd FastRBind(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2){ + Eigen::MatrixXd mat3(mat1.rows() + mat2.rows(), mat1.cols()); + mat3 << mat1, mat2; + return(mat3); +} \ No newline at end of file diff --git a/src/data_manipulation.h b/src/data_manipulation.h new file mode 100644 index 000000000..00d787091 --- /dev/null +++ b/src/data_manipulation.h @@ -0,0 +1,35 @@ +#ifndef DATA_MANIPULATION +#define DATA_MANIPULATION + +#include +#include +#include +#include + +using namespace Rcpp; + +//---------------------------------------------------- +Eigen::SparseMatrix RunUMISampling(Eigen::SparseMatrix 
data, int sample_val, + bool upsample, bool display_progress); +Eigen::SparseMatrix RunUMISamplingPerCell(Eigen::SparseMatrix data, + NumericVector sample_val, bool upsample, + bool display_progress); +Eigen::SparseMatrix RowMergeMatrices(Eigen::SparseMatrix mat1, + Eigen::SparseMatrix mat2, + std::vector< std::string > mat1_rownames, + std::vector< std::string > mat2_rownames, + std::vector< std::string > all_rownames); +Eigen::SparseMatrix LogNorm(Eigen::SparseMatrix data, int scale_factor, + bool display_progress ); +Eigen::MatrixXd FastMatMult(Eigen::MatrixXd m1, Eigen::MatrixXd m2); +Eigen::MatrixXd FastRowScale(Eigen::MatrixXd mat, bool scale, bool center, double scale_max, + bool display_progress); +Eigen::MatrixXd FastSparseRowScale(Eigen::SparseMatrix mat, bool scale, bool center, + double scale_max, bool display_progress); +Eigen::MatrixXd FastCov(Eigen::MatrixXd mat, bool center); +Eigen::MatrixXd FastCovMats(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2, bool center); +Eigen::MatrixXd Standardize(Eigen::MatrixXd mat, bool display_progress); +Eigen::MatrixXd FastRBind(Eigen::MatrixXd mat1, Eigen::MatrixXd mat2); +//---------------------------------------------------- + +#endif//DATA_MANIPULATION \ No newline at end of file diff --git a/tests/testdata/nbt_small.Rdata b/tests/testdata/nbt_small.Rdata new file mode 100644 index 000000000..1edc20f0c Binary files /dev/null and b/tests/testdata/nbt_small.Rdata differ diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 000000000..72a15ce4b --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(Seurat) + +test_check("Seurat") diff --git a/tests/testthat/test_data_manipulation.R b/tests/testthat/test_data_manipulation.R new file mode 100644 index 000000000..732cd0ae0 --- /dev/null +++ b/tests/testthat/test_data_manipulation.R @@ -0,0 +1,111 @@ +# Tests for functions in data_manipulation.cpp +set.seed(42) +library(Matrix) + +# Tests for row merging +# 
-------------------------------------------------------------------------------- +context("Row Merging") + +m1 <- rsparsematrix(10, 10, 0.1) +m2 <- rsparsematrix(10, 10, 0.1) +m1.names <- paste0("row", sample(1:10, size = 10)) +m2.names <- paste0("row", sample(1:20, size = 10)) +all.names <- union(m1.names, m2.names) +rownames(m1) <- m1.names +rownames(m2) <- m2.names +m1 <- as(m1, "RsparseMatrix") +m2 <- as(m2, "RsparseMatrix") + +test_that("Row merging done correctly", { + m3 <- RowMergeMatrices(mat1 = m1, mat2 = m2, mat1_rownames = m1.names, mat2_rownames = m2.names, + all_rownames = all.names) + expect_equal(m3[1, 14], -0.17) + expect_equal(m3[3, 2], -1.4) + expect_equal(m3[14, 18], -0.43) + expect_equal(length(m3), 280) +}) + +# Tests for log normalization +# -------------------------------------------------------------------------------- +context("Log Normalization") + +mat <- as(matrix(1:16, ncol = 4, nrow = 4), "sparseMatrix") + +test_that("Log Normalization returns expected values", { + mat.norm.r <- log1p(sweep(mat, 2, Matrix::colSums(mat), FUN = "/") * 1e4) + mat.norm <- LogNorm(mat, 1e4, display_progress = F) + expect_equal(mat.norm[1, ], mat.norm.r[1, ]) + expect_equal(mat.norm[4, 4], mat.norm.r[4, 4]) +}) + +# Tests for matrix multiply +# -------------------------------------------------------------------------------- +context("Matrix Multiply") + +mat <- as.matrix(mat) + +test_that("Fast implementation of matrix multiply returns as expected", { + expect_equal(mat %*% mat, FastMatMult(mat, mat)) + mat[1, 1] <- NA + expect_equal(mat %*% mat, FastMatMult(mat, mat)) + mat[1, 1] <- NaN + expect_equal(mat %*% mat, FastMatMult(mat, mat)) +}) + +# Tests for scaling data +# -------------------------------------------------------------------------------- +context("Fast Scale Data Functions") + +mat <- matrix(seq(0.001, 0.1, 0.001), nrow = 10, ncol = 10) + +# should be the equivalent of t(scale(t(mat))) +test_that("Fast implementation of row scaling returns 
expected values", { + expect_equal(t(scale(t(mat))[1:10, 1:10]), FastRowScale(mat, display_progress = FALSE)) + expect_equal(t(scale(t(mat), center = FALSE))[1:10, 1:10], + FastRowScale(mat, center = FALSE, display_progress = FALSE)) + expect_equal(t(scale(t(mat), scale = FALSE))[1:10, 1:10], + FastRowScale(mat, scale = FALSE, display_progress = FALSE)) + expect_equal(t(scale(t(mat), scale = FALSE, center = F))[1:10, 1:10], + FastRowScale(mat, scale = FALSE, center = F, display_progress = FALSE)) +}) + +# should be the equivalent of scale(mat, TRUE, apply(mat, 2, sd)) +test_that("Standardize returns expected values", { + expect_equal(Standardize(mat, display_progress = FALSE), scale(mat, TRUE, apply(mat, 2, sd)), + check.attributes = FALSE) +}) + +# should be the equivalent of t(scale(t(mat))) +mat <- rsparsematrix(10, 15, 0.1) +test_that("Fast implementation of row scaling returns expected values", { + expect_equal(t(scale(t(as.matrix(mat))))[1:10, 1:15], FastSparseRowScale(mat, display_progress = FALSE), + check.attributes = FALSE) + expect_equal(t(scale(t(as.matrix(mat)), center = FALSE))[1:10, 1:15], + FastSparseRowScale(mat, center = FALSE, display_progress = FALSE), + check.attributes = FALSE) + expect_equal(t(scale(t(as.matrix(mat)), scale = FALSE))[1:10, 1:15], + FastSparseRowScale(mat, scale = FALSE, display_progress = FALSE), + check.attributes = FALSE) + expect_equal(t(scale(t(as.matrix(mat)), scale = FALSE, center = F))[1:10, 1:15], + FastSparseRowScale(mat, scale = FALSE, center = F, display_progress = FALSE), + check.attributes = FALSE) +}) + +# Tests for fast basic stats functions +# -------------------------------------------------------------------------------- +set.seed(42) +mat <- replicate(10, rchisq(10, 4)) +fcv <- FastCov(mat) +cv <- cov(mat) +test_that("Fast implementation of covariance returns expected values", { + expect_equal(fcv[1,1], 9.451051142) + expect_equal(fcv[10,10], 5.6650068) + expect_equal(fcv, cv) +}) + +merged.mat <- 
FastRBind(mat, fcv) +test_that("Fast implementation of rbind returns expected values", { + expect_equal(merged.mat, rbind(mat, fcv)) + expect_equal(mat[1,1], merged.mat[1,1]) + expect_equal(fcv[10,10], merged.mat[20,10]) +}) diff --git a/tests/testthat/test_seurat_object.R b/tests/testthat/test_seurat_object.R new file mode 100644 index 000000000..7946e741a --- /dev/null +++ b/tests/testthat/test_seurat_object.R @@ -0,0 +1,211 @@ +# Tests for functions dependent on a seurat object +set.seed(42) + +# load a minimal example data set (subset of nbt dataset) +load("../testdata/nbt_small.Rdata") +nbt.small <- log(nbt.small + 1) + + +# Test Initial Normalization +expect_equal(LogNormalize(matrix(1:16, nrow = 4))[1,1], 6.908755, tolerance = 1e-6) + +# Tests for object creation (via new/Setup) +# -------------------------------------------------------------------------------- +context("Object creation") + +# Generate Seurat object +min.cells <- 3 +project.name <- "nbt.test" +names.field <- 2 +names.delim <- "_" +min.genes <- 1000 +expression.thresh <- 1 + +nbt.test <- new("seurat", raw.data = nbt.small) + +nbt.test <- CreateSeuratObject(raw.data = nbt.small, + project = project.name, + min.cells = min.cells, + names.field = names.field, + names.delim = names.delim, + min.genes = min.genes, + is.expr = expression.thresh, + do.scale = T, + do.center = T) + +test_that("object initialization creates seurat object", { + expect_is(nbt.test, "seurat") +}) + +test_that("entered parameters set correctly", { + expect_match(project.name, nbt.test@project.name) + expect_equal(expression.thresh, nbt.test@is.expr) +}) +test_that("correct cells are used",{ + gene.count <- nbt.test@meta.data$nGene + expect_equal(min(gene.count), 2405) + expect_true(all(gene.count >= min.genes)) +}) + +test_that("correct genes are used", { + useable.genes <- rowSums(nbt.test@raw.data > expression.thresh) + useable.genes <- useable.genes[useable.genes >= min.cells] + used.genes <- rownames(nbt.test@data) + 
+ expect_true(length(useable.genes) > 0) + expect_equal(length(useable.genes), length(used.genes)) +}) + +test_that("names and IDs set correctly", { + expect_true(length(colnames(nbt.test@raw.data)) > 0) + expect_equal(nbt.test@cell.names, colnames(nbt.test@raw.data)) + + expected.cluster.ids = c("GW21.2", "GW16", "GW21") + expect_equal(as.vector(unique(nbt.test@ident)), expected.cluster.ids) + expect_equal(as.vector(unique(nbt.test@ident)), as.vector(unique(nbt.test@meta.data$orig.ident))) + +}) + +test_that("scaling done correctly", { + expect_equal(nbt.test@scale.data["AACS", "Hi_GW21.2_3"], 1.6771640694) + expect_equal(nbt.test@scale.data["ZYX", "Hi_GW16_1"], -0.61829233) +}) + +# Test dimensional reduction +# -------------------------------------------------------------------------------- +context("PCA dimensional reduction") + +nbt.test <- FindVariableGenes( + nbt.test, + y.cutoff = 2, + x.low.cutoff = 2, + mean.function = ExpMean, + dispersion.function = LogVMR +) + +pcs.compute <- 3 +nbt.test <- RunPCA(nbt.test, pcs.compute = pcs.compute, do.print = FALSE, weight.by.var = F) + +test_that("PCA returns expected data when not scaling", { + expect_equal(abs(nbt.test@dr$pca@cell.embeddings[1,1]), 0.26627994, tolerance = 1e-6) + expect_equal(abs(nbt.test@dr$pca@gene.loadings[1,1]), 0.5261299, tolerance = 1e-6) + expect_equal(ncol(nbt.test@dr$pca@gene.loadings), pcs.compute) + expect_equal(ncol(nbt.test@dr$pca@cell.embeddings), pcs.compute) + +}) + +nbt.test <- RunPCA(nbt.test, pcs.compute = pcs.compute, do.print = FALSE) +test_that("PCA returns expected data when scaling by variance explained", { + expect_true(nrow(nbt.test@dr$pca@cell.embeddings) == ncol(nbt.test@data)) + expect_true(nrow(nbt.test@dr$pca@gene.loadings) == length(nbt.test@var.genes)) + expect_equal(abs(nbt.test@dr$pca@cell.embeddings[1,1]), 1.423131, tolerance = 1e-6) + expect_equal(abs(nbt.test@dr$pca@gene.loadings[1,1]), 0.5261299, tolerance = 1e-6 ) +}) + +# Tests for tSNE +# 
-------------------------------------------------------------------------------- +context("tSNE") +nbt.test <- RunTSNE(nbt.test, dims.use = 1:2, do.fast = T, perplexity = 4) +test_that("tSNE is run correctly", { + expect_equal(nrow(nbt.test@dr$tsne@cell.embeddings), ncol(nbt.test@data)) + expect_equal(unname(nbt.test@dr$tsne@cell.embeddings[1, 1]), 8.958629, tolerance = 1e-6) +}) + +test_that("tSNE plots correctly", { + p <- TSNEPlot(nbt.test) + expect_is(p, "list") + expect_equal(length(unique(p[[1]][[1]]$group)), 3) +}) + +# Tests for plotting functionality (via Setup) +# -------------------------------------------------------------------------------- +context("Plotting/Visualization") + +test_that("Violin plots (VlnPlot() ) return as expected", { + expect_is(VlnPlot(nbt.test, "ZYX", do.ret = T)$layers[[1]]$geom, "GeomViolin" ) + expect_equal(length(VlnPlot(nbt.test, c("ZYX", "AACS"), do.return = T, return.plotlist = T)), 2) + +}) + +test_that("CellPlots return as expected", { + expect_equal(CellPlot(nbt.test, nbt.test@cell.names[1], nbt.test@cell.names[2]), NULL) +}) + +test_that("GenePlots return as expected", { + expect_equal(GenePlot(nbt.test,"DLX1","DLX2"), NULL) +}) + +test_that("FeaturePlot works as expected", { + expect_is(FeaturePlot(nbt.test, "DLX1"), "NULL") + plot <- FeaturePlot(nbt.test, c("DLX1", "nGene"), do.return = T) + expect_is(plot, "list") + expect_equal(length(plot), 2) + expect_is(FeaturePlot(nbt.test, "DLX1", cols.use = "Purples"), "NULL") +}) + + +# Tests for clustering related functions +# -------------------------------------------------------------------------------- +context("Clustering Functions") + +test_that("SNN calculations are correct and handled properly", { + expect_true(length(nbt.test@snn) == 0) + + nbt.test <- FindClusters(nbt.test, dims.use = 1:2, print.output = 0, k.param = 4, k.scale = 1, save.SNN = T) + expect_true(length(nbt.test@snn) > 1) + expect_equal(nbt.test@snn[2,9], 1/3) + + nbt.test <- FindClusters(nbt.test, 
resolution = 1, print.output = 0) + + expect_warning(FindClusters(nbt.test, k.param = 4, reuse.SNN = T, resolution = 1, n.iter = 1, n.start = 1, print.output = 0)) + nbt.test@snn <- sparseMatrix(1, 1, x = 1) + expect_error(FindClusters(nbt.test, resolution = 1, reuse.SNN = T)) + +}) +nbt.test <- FindClusters(nbt.test, k.param = 4, resolution = seq(1,2,0.1), print.output = 0, n.iter = 1, + n.start = 1) +test_that("Clustering over multiple resolution values handled correctly", { + expect_equal(length(nbt.test@meta.data$res.1), ncol(nbt.test@data)) + expect_equal(length(nbt.test@meta.data$res.2), ncol(nbt.test@data)) + expect_equal(length(nbt.test@snn), 1) +}) + +# Test subsetting functionality +# -------------------------------------------------------------------------------- +context("Cell Subsetting") + +test_that("WhichCells subsets properly", { + expect_equal(length(WhichCells(nbt.test, 1)), 3) + expect_equal(length(WhichCells(nbt.test, c(1,2))), 6) + expect_error(WhichCells(nbt.test, 10)) + expect_equal(WhichCells(nbt.test)[1], "Hi_GW21.2_3") + expect_equal(WhichCells(nbt.test, subset.name = "nGene", accept.high = 3000, accept.low = 2500), "Hi_GW16_23") + expect_equal(WhichCells(nbt.test, subset.name = "PC1", accept.high = 1.5, accept.low = 1.4), "Hi_GW21.2_3") + + expect_equal(length(WhichCells(nbt.test, max.cells.per.ident = 1)), length(unique(nbt.test@ident))) + expect_equal(length(WhichCells(nbt.test, c(1,2), max.cells.per.ident = 1)), 2) + expect_equal(length(WhichCells(nbt.test, subset.name = "nGene", max.cells.per.ident = 1)), length(unique(nbt.test@ident))) +}) + +test_that("SubsetData works properly", { + nbt.test@dr <- list() + count <- length(WhichCells(nbt.test, 1)) + nbt.test.subset <- SubsetData(nbt.test, ident.use = 1) + expect_equal(length(nbt.test.subset@ident), count) +}) + +# Test CCA procedure +# -------------------------------------------------------------------------------- +context("CCA Alignment") +scrambled.cells <- 
sample(nbt.test@cell.names) +c1 <- SubsetData(nbt.test, cells.use = scrambled.cells[1:7]) +c2 <- SubsetData(nbt.test, cells.use = scrambled.cells[8:14]) +c3 <- RunCCA(c1, c2, genes.use = c1@var.genes, num.cc = 3) + +test_that("CCA returns the expected cell.embeddings matrix values", { + expect_equal(nrow(c3@dr$cca@cell.embeddings), 14) + expect_equal(ncol(c3@dr$cca@cell.embeddings), 3) + expect_equal(abs(unname(c3@dr$cca@cell.embeddings[1,1])), 0.3108733, tolerance = 1e-6 ) + expect_equal(abs(unname(c3@dr$cca@cell.embeddings[14,3])), 0.6064297, tolerance = 1e-6 ) +}) +