1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved. 2*2abb3134SXin Li# 3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License"); 4*2abb3134SXin Li# you may not use this file except in compliance with the License. 5*2abb3134SXin Li# You may obtain a copy of the License at 6*2abb3134SXin Li# 7*2abb3134SXin Li# http://www.apache.org/licenses/LICENSE-2.0 8*2abb3134SXin Li# 9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software 10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS, 11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*2abb3134SXin Li# See the License for the specific language governing permissions and 13*2abb3134SXin Li# limitations under the License. 14*2abb3134SXin Li 15*2abb3134SXin Li"""RAPPOR client library. 16*2abb3134SXin Li 17*2abb3134SXin LiPrivacy is ensured without a third party by only sending RAPPOR'd data over the 18*2abb3134SXin Linetwork (as opposed to raw client data). 19*2abb3134SXin Li 20*2abb3134SXin LiNote that we use MD5 for the Bloom filter hash function. (security not required) 21*2abb3134SXin Li""" 22*2abb3134SXin Liimport csv 23*2abb3134SXin Liimport hashlib 24*2abb3134SXin Liimport hmac 25*2abb3134SXin Liimport json 26*2abb3134SXin Liimport struct 27*2abb3134SXin Liimport sys 28*2abb3134SXin Li 29*2abb3134SXin Lifrom random import SystemRandom 30*2abb3134SXin Li 31*2abb3134SXin Liclass Error(Exception): 32*2abb3134SXin Li pass 33*2abb3134SXin Li 34*2abb3134SXin Li 35*2abb3134SXin Lidef log(msg, *args): 36*2abb3134SXin Li if args: 37*2abb3134SXin Li msg = msg % args 38*2abb3134SXin Li print >>sys.stderr, msg 39*2abb3134SXin Li 40*2abb3134SXin Li 41*2abb3134SXin Liclass Params(object): 42*2abb3134SXin Li """RAPPOR encoding parameters. 43*2abb3134SXin Li 44*2abb3134SXin Li These affect privacy/anonymity. See the paper for details. 45*2abb3134SXin Li """ 46*2abb3134SXin Li def __init__(self): 47*2abb3134SXin Li self.num_bloombits = 16 # Number of bloom filter bits (k) 48*2abb3134SXin Li self.num_hashes = 2 # Number of bloom filter hashes (h) 49*2abb3134SXin Li self.num_cohorts = 64 # Number of cohorts (m) 50*2abb3134SXin Li self.prob_p = 0.50 # Probability p 51*2abb3134SXin Li self.prob_q = 0.75 # Probability q 52*2abb3134SXin Li self.prob_f = 0.50 # Probability f 53*2abb3134SXin Li 54*2abb3134SXin Li # For testing 55*2abb3134SXin Li def __eq__(self, other): 56*2abb3134SXin Li return self.__dict__ == other.__dict__ 57*2abb3134SXin Li 58*2abb3134SXin Li def __repr__(self): 59*2abb3134SXin Li return repr(self.__dict__) 60*2abb3134SXin Li 61*2abb3134SXin Li def to_json(self): 62*2abb3134SXin Li """Convert this instance to JSON. 63*2abb3134SXin Li 64*2abb3134SXin Li The names are be compatible with the apps/api server. 65*2abb3134SXin Li """ 66*2abb3134SXin Li return json.dumps({ 67*2abb3134SXin Li 'numBits': self.num_bloombits, 68*2abb3134SXin Li 'numHashes': self.num_hashes, 69*2abb3134SXin Li 'numCohorts': self.num_cohorts, 70*2abb3134SXin Li 'probPrr': self.prob_f, 71*2abb3134SXin Li 'probIrr0': self.prob_p, 72*2abb3134SXin Li 'probIrr1': self.prob_q, 73*2abb3134SXin Li }) 74*2abb3134SXin Li 75*2abb3134SXin Li # NOTE: 76*2abb3134SXin Li # - from_csv is currently used in sum_bits.py 77*2abb3134SXin Li # - to_csv is in rappor_sim.print_params 78*2abb3134SXin Li @staticmethod 79*2abb3134SXin Li def from_csv(f): 80*2abb3134SXin Li """Read the RAPPOR parameters from a CSV file. 81*2abb3134SXin Li 82*2abb3134SXin Li Args: 83*2abb3134SXin Li f: file handle 84*2abb3134SXin Li 85*2abb3134SXin Li Returns: 86*2abb3134SXin Li Params instance. 87*2abb3134SXin Li 88*2abb3134SXin Li Raises: 89*2abb3134SXin Li rappor.Error: when the file is malformed. 90*2abb3134SXin Li """ 91*2abb3134SXin Li c = csv.reader(f) 92*2abb3134SXin Li ok = False 93*2abb3134SXin Li p = Params() 94*2abb3134SXin Li for i, row in enumerate(c): 95*2abb3134SXin Li 96*2abb3134SXin Li if i == 0: 97*2abb3134SXin Li if row != ['k', 'h', 'm', 'p', 'q', 'f']: 98*2abb3134SXin Li raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row) 99*2abb3134SXin Li 100*2abb3134SXin Li elif i == 1: 101*2abb3134SXin Li try: 102*2abb3134SXin Li # NOTE: May raise exceptions 103*2abb3134SXin Li p.num_bloombits = int(row[0]) 104*2abb3134SXin Li p.num_hashes = int(row[1]) 105*2abb3134SXin Li p.num_cohorts = int(row[2]) 106*2abb3134SXin Li p.prob_p = float(row[3]) 107*2abb3134SXin Li p.prob_q = float(row[4]) 108*2abb3134SXin Li p.prob_f = float(row[5]) 109*2abb3134SXin Li except (ValueError, IndexError) as e: 110*2abb3134SXin Li raise Error('Row is malformed: %s' % e) 111*2abb3134SXin Li ok = True 112*2abb3134SXin Li 113*2abb3134SXin Li else: 114*2abb3134SXin Li raise Error('Params file should only have two rows') 115*2abb3134SXin Li 116*2abb3134SXin Li if not ok: 117*2abb3134SXin Li raise Error("Expected second row with params") 118*2abb3134SXin Li 119*2abb3134SXin Li return p 120*2abb3134SXin Li 121*2abb3134SXin Li 122*2abb3134SXin Liclass _SecureRandom(object): 123*2abb3134SXin Li """Returns an integer where each bit has probability p of being 1.""" 124*2abb3134SXin Li 125*2abb3134SXin Li def __init__(self, prob_one, num_bits): 126*2abb3134SXin Li self.prob_one = prob_one 127*2abb3134SXin Li self.num_bits = num_bits 128*2abb3134SXin Li 129*2abb3134SXin Li def __call__(self): 130*2abb3134SXin Li p = self.prob_one 131*2abb3134SXin Li rand = SystemRandom() 132*2abb3134SXin Li r = 0 133*2abb3134SXin Li 134*2abb3134SXin Li for i in xrange(self.num_bits): 135*2abb3134SXin Li bit = rand.random() < p 136*2abb3134SXin Li r |= (bit << i) # using bool as int 137*2abb3134SXin Li return r 138*2abb3134SXin Li 139*2abb3134SXin Li 140*2abb3134SXin Liclass SecureIrrRand(object): 141*2abb3134SXin Li """Python's os.random()""" 142*2abb3134SXin Li 143*2abb3134SXin Li def __init__(self, params): 144*2abb3134SXin Li """ 145*2abb3134SXin Li Args: 146*2abb3134SXin Li params: rappor.Params 147*2abb3134SXin Li """ 148*2abb3134SXin Li num_bits = params.num_bloombits 149*2abb3134SXin Li # IRR probabilities 150*2abb3134SXin Li 151*2abb3134SXin Li self.p_gen = _SecureRandom(params.prob_p, num_bits) 152*2abb3134SXin Li self.q_gen = _SecureRandom(params.prob_q, num_bits) 153*2abb3134SXin Li 154*2abb3134SXin Li 155*2abb3134SXin Lidef to_big_endian(i): 156*2abb3134SXin Li """Convert an integer to a 4 byte big endian string. Used for hashing.""" 157*2abb3134SXin Li # https://docs.python.org/2/library/struct.html 158*2abb3134SXin Li # - Big Endian (>) for consistent network byte order. 159*2abb3134SXin Li # - L means 4 bytes when using > 160*2abb3134SXin Li return struct.pack('>L', i) 161*2abb3134SXin Li 162*2abb3134SXin Li 163*2abb3134SXin Lidef get_bloom_bits(word, cohort, num_hashes, num_bloombits): 164*2abb3134SXin Li """Return an array of bits to set in the bloom filter. 165*2abb3134SXin Li 166*2abb3134SXin Li In the real report, we bitwise-OR them together. In hash candidates, we put 167*2abb3134SXin Li them in separate entries in the "map" matrix. 168*2abb3134SXin Li """ 169*2abb3134SXin Li value = to_big_endian(cohort) + word # Cohort is 4 byte prefix. 170*2abb3134SXin Li md5 = hashlib.md5(value) 171*2abb3134SXin Li 172*2abb3134SXin Li digest = md5.digest() 173*2abb3134SXin Li 174*2abb3134SXin Li # Each has is a byte, which means we could have up to 256 bit Bloom filters. 175*2abb3134SXin Li # There are 16 bytes in an MD5, in which case we can have up to 16 hash 176*2abb3134SXin Li # functions per Bloom filter. 177*2abb3134SXin Li if num_hashes > len(digest): 178*2abb3134SXin Li raise RuntimeError("Can't have more than %d hashes" % md5) 179*2abb3134SXin Li 180*2abb3134SXin Li #log('hash_input %r', value) 181*2abb3134SXin Li #log('Cohort %d', cohort) 182*2abb3134SXin Li #log('MD5 %s', md5.hexdigest()) 183*2abb3134SXin Li 184*2abb3134SXin Li return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)] 185*2abb3134SXin Li 186*2abb3134SXin Li 187*2abb3134SXin Lidef get_prr_masks(secret, word, prob_f, num_bits): 188*2abb3134SXin Li h = hmac.new(secret, word, digestmod=hashlib.sha256) 189*2abb3134SXin Li #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest()) 190*2abb3134SXin Li 191*2abb3134SXin Li # Now go through each byte 192*2abb3134SXin Li digest_bytes = h.digest() 193*2abb3134SXin Li assert len(digest_bytes) == 32 194*2abb3134SXin Li 195*2abb3134SXin Li # Use 32 bits. If we want 64 bits, it may be fine to generate another 32 196*2abb3134SXin Li # bytes by repeated HMAC. For arbitrary numbers of bytes it's probably 197*2abb3134SXin Li # better to use the HMAC-DRBG algorithm. 198*2abb3134SXin Li if num_bits > len(digest_bytes): 199*2abb3134SXin Li raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d)) 200*2abb3134SXin Li 201*2abb3134SXin Li threshold128 = prob_f * 128 202*2abb3134SXin Li 203*2abb3134SXin Li uniform = 0 204*2abb3134SXin Li f_mask = 0 205*2abb3134SXin Li 206*2abb3134SXin Li for i in xrange(num_bits): 207*2abb3134SXin Li ch = digest_bytes[i] 208*2abb3134SXin Li byte = ord(ch) 209*2abb3134SXin Li 210*2abb3134SXin Li u_bit = byte & 0x01 # 1 bit of entropy 211*2abb3134SXin Li uniform |= (u_bit << i) # maybe set bit in mask 212*2abb3134SXin Li 213*2abb3134SXin Li rand128 = byte >> 1 # 7 bits of entropy 214*2abb3134SXin Li noise_bit = (rand128 < threshold128) 215*2abb3134SXin Li f_mask |= (noise_bit << i) # maybe set bit in mask 216*2abb3134SXin Li 217*2abb3134SXin Li return uniform, f_mask 218*2abb3134SXin Li 219*2abb3134SXin Li 220*2abb3134SXin Lidef bit_string(irr, num_bloombits): 221*2abb3134SXin Li """Like bin(), but uses leading zeroes, and no '0b'.""" 222*2abb3134SXin Li s = '' 223*2abb3134SXin Li bits = [] 224*2abb3134SXin Li for bit_num in xrange(num_bloombits): 225*2abb3134SXin Li if irr & (1 << bit_num): 226*2abb3134SXin Li bits.append('1') 227*2abb3134SXin Li else: 228*2abb3134SXin Li bits.append('0') 229*2abb3134SXin Li return ''.join(reversed(bits)) 230*2abb3134SXin Li 231*2abb3134SXin Li 232*2abb3134SXin Liclass Encoder(object): 233*2abb3134SXin Li """Obfuscates values for a given user using the RAPPOR privacy algorithm.""" 234*2abb3134SXin Li 235*2abb3134SXin Li def __init__(self, params, cohort, secret, irr_rand): 236*2abb3134SXin Li """ 237*2abb3134SXin Li Args: 238*2abb3134SXin Li params: RAPPOR Params() controlling privacy 239*2abb3134SXin Li cohort: integer cohort, for Bloom hashing. 240*2abb3134SXin Li secret: secret string, for the PRR to be a deterministic function of the 241*2abb3134SXin Li reported value. 242*2abb3134SXin Li irr_rand: IRR randomness interface. 243*2abb3134SXin Li """ 244*2abb3134SXin Li # RAPPOR params. NOTE: num_cohorts isn't used. p and q are used by 245*2abb3134SXin Li # irr_rand. 246*2abb3134SXin Li self.params = params 247*2abb3134SXin Li self.cohort = cohort # associated: MD5 248*2abb3134SXin Li self.secret = secret # associated: HMAC-SHA256 249*2abb3134SXin Li self.irr_rand = irr_rand # p and q used 250*2abb3134SXin Li 251*2abb3134SXin Li def _internal_encode_bits(self, bits): 252*2abb3134SXin Li """Helper function for simulation / testing. 253*2abb3134SXin Li 254*2abb3134SXin Li Returns: 255*2abb3134SXin Li The PRR and IRR. The PRR should never be sent over the network. 256*2abb3134SXin Li """ 257*2abb3134SXin Li # Compute Permanent Randomized Response (PRR). 258*2abb3134SXin Li uniform, f_mask = get_prr_masks( 259*2abb3134SXin Li self.secret, to_big_endian(bits), self.params.prob_f, 260*2abb3134SXin Li self.params.num_bloombits) 261*2abb3134SXin Li 262*2abb3134SXin Li # Suppose bit i of the Bloom filter is B_i. Then bit i of the PRR is 263*2abb3134SXin Li # defined as: 264*2abb3134SXin Li # 265*2abb3134SXin Li # 1 with prob f/2 266*2abb3134SXin Li # 0 with prob f/2 267*2abb3134SXin Li # B_i with prob 1-f 268*2abb3134SXin Li 269*2abb3134SXin Li # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with 270*2abb3134SXin Li # probability f. So in the expression below: 271*2abb3134SXin Li # 272*2abb3134SXin Li # - Bits in (uniform & f_mask) are 1 with probability f/2. 273*2abb3134SXin Li # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability 274*2abb3134SXin Li # f, so we get B_i with probability 1-f. 275*2abb3134SXin Li # - The remaining bits are 0, with remaining probability f/2. 276*2abb3134SXin Li 277*2abb3134SXin Li prr = (bits & ~f_mask) | (uniform & f_mask) 278*2abb3134SXin Li 279*2abb3134SXin Li #log('U %s / F %s', bit_string(uniform, num_bits), 280*2abb3134SXin Li # bit_string(f_mask, num_bits)) 281*2abb3134SXin Li 282*2abb3134SXin Li #log('B %s / PRR %s', bit_string(bloom_bits, num_bits), 283*2abb3134SXin Li # bit_string(prr, num_bits)) 284*2abb3134SXin Li 285*2abb3134SXin Li # Compute Instantaneous Randomized Response (IRR). 286*2abb3134SXin Li # If PRR bit is 0, IRR bit is 1 with probability p. 287*2abb3134SXin Li # If PRR bit is 1, IRR bit is 1 with probability q. 288*2abb3134SXin Li p_bits = self.irr_rand.p_gen() 289*2abb3134SXin Li q_bits = self.irr_rand.q_gen() 290*2abb3134SXin Li 291*2abb3134SXin Li irr = (p_bits & ~prr) | (q_bits & prr) 292*2abb3134SXin Li 293*2abb3134SXin Li return prr, irr # IRR is the rappor 294*2abb3134SXin Li 295*2abb3134SXin Li def _internal_encode(self, word): 296*2abb3134SXin Li """Helper function for simulation / testing. 297*2abb3134SXin Li 298*2abb3134SXin Li Returns: 299*2abb3134SXin Li The Bloom filter bits, PRR, and IRR. The first two values should never 300*2abb3134SXin Li be sent over the network. 301*2abb3134SXin Li """ 302*2abb3134SXin Li bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes, 303*2abb3134SXin Li self.params.num_bloombits) 304*2abb3134SXin Li 305*2abb3134SXin Li bloom = 0 306*2abb3134SXin Li for bit_to_set in bloom_bits: 307*2abb3134SXin Li bloom |= (1 << bit_to_set) 308*2abb3134SXin Li 309*2abb3134SXin Li prr, irr = self._internal_encode_bits(bloom) 310*2abb3134SXin Li return bloom, prr, irr 311*2abb3134SXin Li 312*2abb3134SXin Li def encode_bits(self, bits): 313*2abb3134SXin Li """Encode a string with RAPPOR. 314*2abb3134SXin Li 315*2abb3134SXin Li Args: 316*2abb3134SXin Li bits: An integer representing bits to encode. 317*2abb3134SXin Li 318*2abb3134SXin Li Returns: 319*2abb3134SXin Li An integer that is the IRR (Instantaneous Randomized Response). 320*2abb3134SXin Li """ 321*2abb3134SXin Li _, irr = self._internal_encode_bits(bits) 322*2abb3134SXin Li return irr 323*2abb3134SXin Li 324*2abb3134SXin Li def encode(self, word): 325*2abb3134SXin Li """Encode a string with RAPPOR. 326*2abb3134SXin Li 327*2abb3134SXin Li Args: 328*2abb3134SXin Li word: the string that should be privately transmitted. 329*2abb3134SXin Li 330*2abb3134SXin Li Returns: 331*2abb3134SXin Li An integer that is the IRR (Instantaneous Randomized Response). 332*2abb3134SXin Li """ 333*2abb3134SXin Li _, _, irr = self._internal_encode(word) 334*2abb3134SXin Li return irr 335