Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rust: Sensitive data library #18414

Merged
merged 5 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion config/identical-files.json
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,8 @@
"javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll",
"python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll",
"ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll",
"swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll"
"swift/ql/lib/codeql/swift/security/internal/SensitiveDataHeuristics.qll",
"rust/ql/lib/codeql/rust/security/internal/SensitiveDataHeuristics.qll"
],
"IncompleteUrlSubstringSanitization": [
"javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll",
Expand Down
86 changes: 86 additions & 0 deletions rust/ql/lib/codeql/rust/security/SensitiveData.qll
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* Provides classes and predicates for identifying sensitive data.
*
* 'Sensitive' data is anything that should not be sent in unencrypted form. This library tries to
* guess where sensitive data may either be stored in a variable or produced by a function call.
*/

import rust
private import internal.SensitiveDataHeuristics
private import codeql.rust.dataflow.DataFlow

/**
* A data flow node that might contain sensitive data.
*
* Concrete subclasses decide which nodes qualify and report the kind of sensitive data
* through `getClassification()`.
*/
cached
abstract class SensitiveData extends DataFlow::Node {
/**
* Gets a classification of the kind of sensitive data this node might contain.
*/
cached
abstract SensitiveDataClassification getClassification();
}

/**
 * A function whose name matches the sensitive-data name heuristics, and which
 * therefore might produce sensitive data.
 */
private class SensitiveDataFunction extends Function {
  SensitiveDataClassification classification;

  SensitiveDataFunction() {
    exists(string functionName |
      functionName = this.getName().getText() and
      HeuristicNames::nameIndicatesSensitiveData(functionName, classification)
    )
  }

  /** Gets the kind of sensitive data suggested by the name of this function. */
  SensitiveDataClassification getClassification() { result = classification }
}

/**
 * A call whose statically resolved target is a `SensitiveDataFunction`, and which
 * therefore might produce sensitive data.
 */
private class SensitiveDataCall extends SensitiveData {
  SensitiveDataClassification classification;

  SensitiveDataCall() {
    exists(CallExprBase call, SensitiveDataFunction target |
      call = this.asExpr().getAstNode() and
      target = call.getStaticTarget() and
      classification = target.getClassification()
    )
  }

  /** Gets the classification inherited from the called function. */
  override SensitiveDataClassification getClassification() { result = classification }
}

/**
 * A variable whose name matches the sensitive-data name heuristics, and which
 * therefore might contain sensitive data.
 */
private class SensitiveDataVariable extends Variable {
  SensitiveDataClassification classification;

  SensitiveDataVariable() {
    exists(string variableName |
      variableName = this.getName() and
      HeuristicNames::nameIndicatesSensitiveData(variableName, classification)
    )
  }

  /** Gets the kind of sensitive data suggested by the name of this variable. */
  SensitiveDataClassification getClassification() { result = classification }
}

/**
 * An access to a `SensitiveDataVariable`, which therefore might produce sensitive data.
 */
private class SensitiveVariableAccess extends SensitiveData {
  SensitiveDataClassification classification;

  SensitiveVariableAccess() {
    exists(VariableAccess access, SensitiveDataVariable accessed |
      access = this.asExpr().getAstNode() and
      accessed = access.getVariable() and
      classification = accessed.getClassification()
    )
  }

  /** Gets the classification inherited from the accessed variable. */
  override SensitiveDataClassification getClassification() { result = classification }
}
188 changes: 188 additions & 0 deletions rust/ql/lib/codeql/rust/security/internal/SensitiveDataHeuristics.qll
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/**
* INTERNAL: Do not use.
*
* Provides classes and predicates for identifying strings that may indicate the presence of sensitive data,
* so that this logic can be shared across our CodeQL analysis of different languages.
*
* 'Sensitive' data in general is anything that should not be sent around in unencrypted form.
*/

/**
* A classification of different kinds of sensitive data:
*
* - secret: generic secret or trusted data;
* - id: a user name or other account information;
* - password: a password or authorization key;
* - certificate: a certificate;
* - private: private data such as credit card numbers.
*
* While classifications are represented as strings, this should not be relied upon.
* Instead, use the predicates in the `SensitiveDataClassification` module (for example
* `SensitiveDataClassification::password()`) to work with classifications.
*/
class SensitiveDataClassification extends string {
SensitiveDataClassification() { this in ["secret", "id", "password", "certificate", "private"] }
}

/**
* Provides predicates to select the different kinds of sensitive data we support.
*
* There is one predicate per classification; each one has a single string result.
*/
module SensitiveDataClassification {
/** Gets the classification for secret or trusted data. */
SensitiveDataClassification secret() { result = "secret" }

/** Gets the classification for user names or other account information. */
SensitiveDataClassification id() { result = "id" }

/** Gets the classification for passwords or authorization keys. */
SensitiveDataClassification password() { result = "password" }

/** Gets the classification for certificates. */
SensitiveDataClassification certificate() { result = "certificate" }

/** Gets the classification for private data. */
SensitiveDataClassification private() { result = "private" }
}

/**
* INTERNAL: Do not use.
*
* Provides heuristics for identifying names related to sensitive information.
*
* NOTE(review): this file is listed in `config/identical-files.json` next to the copies used
* for other languages, so changes here presumably have to be mirrored in those copies.
*/
module HeuristicNames {
/**
* Gets a regular expression that identifies strings that may indicate the presence of secret
* or trusted data.
*
* The negative lookbehinds skip occurrences of `secret` directly preceded by `is`/`is_`, and
* occurrences of `trusted` directly preceded by `un`/`un_`/`is`/`is_`.
*/
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }

/**
* Gets a regular expression that identifies strings that may indicate the presence of
* user names or other account information.
*
* This predicate has several results, one per regular expression.
*/
string maybeAccountInfo() {
result = "(?is).*acc(ou)?nt.*" or
result = "(?is).*(puid|username|userid|session(id|key)).*" or
// Unlike the other patterns this one has no `i` flag: the `[a-z](?=U)` alternative depends
// on letter case (a lower-case letter followed by an upper-case `U`).
result = "(?s).*([uU]|^|_|[a-z](?=U))([uU][iI][dD]).*"
}

/**
* Gets a regular expression that identifies strings that may indicate the presence of
* a password or an authorization key.
*
* The first pattern does not match when `question` appears later in the string.
*/
string maybePassword() {
result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
}

/**
* Gets a regular expression that identifies strings that may indicate the presence of
* a certificate.
*
* An occurrence of `cert` is ignored when `format`, `name` or `ification` appears later
* in the string.
*/
string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name|ification)).*" }

/**
* Gets a regular expression that identifies strings that may indicate the presence of
* private data.
*
* The result is a single case-insensitive pattern assembled from the alternatives below.
*/
string maybePrivate() {
result =
"(?is).*(" +
// Inspired by the list on https://cwe.mitre.org/data/definitions/359.html
// Government identifiers, such as Social Security Numbers
"social.?security|employer.?identification|national.?insurance|resident.?id|" +
"passport.?(num|no)|([_-]|\\b)ssn([_-]|\\b)|" +
// Contact information, such as home addresses
"post.?code|zip.?code|home.?addr|" +
// and telephone numbers
"(mob(ile)?|home).?(num|no|tel|phone)|(tel|fax|phone).?(num|no)|telephone|" +
"emergency.?contact|" +
// Geographic location - where the user is (or was)
"latitude|longitude|nationality|" +
// Financial data - such as credit card numbers, salary, bank accounts, and debts
"(credit|debit|bank|visa).?(card|num|no|acc(ou)?nt)|acc(ou)?nt.?(no|num|credit)|" +
"salary|billing|credit.?(rating|score)|([_-]|\\b)ccn([_-]|\\b)|" +
// Communications - e-mail addresses, private e-mail messages, SMS text messages, chat logs, etc.
// "e(mail|_mail)|" + // this seems too noisy
// Health - medical conditions, insurance status, prescription records
"birth.?da(te|y)|da(te|y).?(of.?)?birth|" +
"medical|(health|care).?plan|healthkit|appointment|prescription|" +
"blood.?(type|alcohol|glucose|pressure)|heart.?(rate|rhythm)|body.?(mass|fat)|" +
"menstrua|pregnan|insulin|inhaler|" +
// Relationships - work and family
"employ(er|ee)|spouse|maiden.?name" +
// ---
").*"
}

/**
* Gets a regular expression that identifies strings that may indicate the presence
* of sensitive data, with `classification` describing the kind of sensitive data involved.
*
* A classification can have more than one regular expression.
*/
string maybeSensitiveRegexp(SensitiveDataClassification classification) {
result = maybeSecret() and classification = SensitiveDataClassification::secret()
or
result = maybeAccountInfo() and classification = SensitiveDataClassification::id()
or
result = maybePassword() and classification = SensitiveDataClassification::password()
or
result = maybeCertificate() and
classification = SensitiveDataClassification::certificate()
or
result = maybePrivate() and
classification = SensitiveDataClassification::private()
}

/**
* Gets a regular expression that identifies strings that may indicate the presence of data
* that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
* suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
*
* We also filter out common words like `certain` and `concert`, since otherwise these could
* be matched by the certificate regular expressions. Same for `accountable` (account), or
* `secretarial` (secret).
*/
string notSensitiveRegexp() {
result =
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|(?<!pass)code)|certain|concert|secretar|accountant|accountab).*"
}

/**
* Holds if `name` may indicate the presence of sensitive data, and `name` does not indicate that
* the data is in fact non-sensitive (for example since it is hashed or encrypted).
*
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (for any
* classification), and the regexp from `notSensitiveRegexp` does not match `name`.
*/
bindingset[name]
predicate nameIndicatesSensitiveData(string name) {
exists(string combinedRegexp |
// Combine all the maybe-sensitive regexps into one using non-capturing groups and |.
combinedRegexp =
"(?:" + strictconcat(string r | r = maybeSensitiveRegexp(_) | r, ")|(?:") + ")"
|
name.regexpMatch(combinedRegexp)
) and
not name.regexpMatch(notSensitiveRegexp())
}

/**
* Holds if `name` may indicate the presence of sensitive data, and
* `name` does not indicate that the data is in fact non-sensitive (for example since
* it is hashed or encrypted). `classification` describes the kind of sensitive data
* involved.
*
* That is, one of the regexps from `maybeSensitiveRegexp` matches `name` (with the
* given classification), and the regexp from `notSensitiveRegexp` does not match
* `name`.
*
* When the set of names is large, it's worth using `nameIndicatesSensitiveData/1` as a first
* pass, since that combines all the regexps into one, and should be faster. Then call this
* predicate to get the classification(s).
*/
bindingset[name]
predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
name.regexpMatch(maybeSensitiveRegexp(classification)) and
not name.regexpMatch(notSensitiveRegexp())
}
}
15 changes: 15 additions & 0 deletions rust/ql/src/queries/summary/SensitiveData.ql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* @name Sensitive Data
* @description List all sensitive data found in the database. Sensitive data is anything that
* should not be sent in unencrypted form.
* @kind problem
* @problem.severity info
* @id rust/summary/sensitive-data
* @tags summary
*/

import rust
import codeql.rust.security.SensitiveData

// One result per sensitive data node and classification.
from SensitiveData d, string message
where message = "Sensitive data (" + d.getClassification() + "): " + d.toString()
select d, message
3 changes: 3 additions & 0 deletions rust/ql/src/queries/summary/SummaryStats.ql
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import rust
import codeql.rust.Concepts
import codeql.rust.security.SensitiveData
import codeql.rust.Diagnostics
import Stats

Expand Down Expand Up @@ -56,4 +57,6 @@ where
key = "Taint sources - total" and value = count(ThreatModelSource s)
or
key = "Taint sources - active" and value = count(ActiveThreatModelSource s)
or
key = "Sensitive data" and value = count(SensitiveData d)
select key, value order by key
Empty file.
36 changes: 36 additions & 0 deletions rust/ql/test/library-tests/sensitivedata/SensitiveData.ql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import rust
import codeql.rust.dataflow.DataFlow
import codeql.rust.dataflow.TaintTracking
import codeql.rust.security.SensitiveData
import utils.test.InlineExpectationsTest

/**
 * Data flow configuration for this test: every `SensitiveData` node is a source, and every
 * argument of a call whose callee path resolves to `crate::test::sink` is a sink.
 */
module SensitiveDataConfig implements DataFlow::ConfigSig {
  predicate isSource(DataFlow::Node source) { source = any(SensitiveData s) }

  predicate isSink(DataFlow::Node sink) {
    exists(CallExpr call, PathExpr callee |
      callee = call.getFunction() and
      callee.getResolvedPath() = "crate::test::sink" and
      sink.asExpr().getExpr() = call.getArgList().getAnArg()
    )
  }
}

module SensitiveDataFlow = TaintTracking::Global<SensitiveDataConfig>;

/**
 * Inline expectations test: reports a `sensitive` tag at each sink reached from a
 * sensitive data source, with the source's classification as the value.
 */
module SensitiveDataTest implements TestSig {
  string getARelevantTag() { result = "sensitive" }

  predicate hasActualResult(Location location, string element, string tag, string value) {
    tag = "sensitive" and
    exists(SensitiveData source, DataFlow::Node sink |
      SensitiveDataFlow::flow(source, sink) and
      value = source.getClassification() and
      element = sink.toString() and
      location = sink.getLocation()
    )
  }
}

import MakeTest<SensitiveDataTest>
Loading
Loading