xref: /aosp_15_r20/external/rappor/client/python/rappor.py (revision 2abb31345f6c95944768b5222a9a5ed3fc68cc00)
1*2abb3134SXin Li# Copyright 2014 Google Inc. All rights reserved.
2*2abb3134SXin Li#
3*2abb3134SXin Li# Licensed under the Apache License, Version 2.0 (the "License");
4*2abb3134SXin Li# you may not use this file except in compliance with the License.
5*2abb3134SXin Li# You may obtain a copy of the License at
6*2abb3134SXin Li#
7*2abb3134SXin Li#     http://www.apache.org/licenses/LICENSE-2.0
8*2abb3134SXin Li#
9*2abb3134SXin Li# Unless required by applicable law or agreed to in writing, software
10*2abb3134SXin Li# distributed under the License is distributed on an "AS IS" BASIS,
11*2abb3134SXin Li# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*2abb3134SXin Li# See the License for the specific language governing permissions and
13*2abb3134SXin Li# limitations under the License.
14*2abb3134SXin Li
15*2abb3134SXin Li"""RAPPOR client library.
16*2abb3134SXin Li
17*2abb3134SXin LiPrivacy is ensured without a third party by only sending RAPPOR'd data over the
18*2abb3134SXin Linetwork (as opposed to raw client data).
19*2abb3134SXin Li
20*2abb3134SXin LiNote that we use MD5 for the Bloom filter hash function. (security not required)
21*2abb3134SXin Li"""
22*2abb3134SXin Liimport csv
23*2abb3134SXin Liimport hashlib
24*2abb3134SXin Liimport hmac
25*2abb3134SXin Liimport json
26*2abb3134SXin Liimport struct
27*2abb3134SXin Liimport sys
28*2abb3134SXin Li
29*2abb3134SXin Lifrom random import SystemRandom
30*2abb3134SXin Li
31*2abb3134SXin Liclass Error(Exception):
32*2abb3134SXin Li  pass
33*2abb3134SXin Li
34*2abb3134SXin Li
35*2abb3134SXin Lidef log(msg, *args):
36*2abb3134SXin Li  if args:
37*2abb3134SXin Li    msg = msg % args
38*2abb3134SXin Li  print >>sys.stderr, msg
39*2abb3134SXin Li
40*2abb3134SXin Li
41*2abb3134SXin Liclass Params(object):
42*2abb3134SXin Li  """RAPPOR encoding parameters.
43*2abb3134SXin Li
44*2abb3134SXin Li  These affect privacy/anonymity.  See the paper for details.
45*2abb3134SXin Li  """
46*2abb3134SXin Li  def __init__(self):
47*2abb3134SXin Li    self.num_bloombits = 16      # Number of bloom filter bits (k)
48*2abb3134SXin Li    self.num_hashes = 2          # Number of bloom filter hashes (h)
49*2abb3134SXin Li    self.num_cohorts = 64        # Number of cohorts (m)
50*2abb3134SXin Li    self.prob_p = 0.50           # Probability p
51*2abb3134SXin Li    self.prob_q = 0.75           # Probability q
52*2abb3134SXin Li    self.prob_f = 0.50           # Probability f
53*2abb3134SXin Li
54*2abb3134SXin Li  # For testing
55*2abb3134SXin Li  def __eq__(self, other):
56*2abb3134SXin Li    return self.__dict__ == other.__dict__
57*2abb3134SXin Li
58*2abb3134SXin Li  def __repr__(self):
59*2abb3134SXin Li    return repr(self.__dict__)
60*2abb3134SXin Li
61*2abb3134SXin Li  def to_json(self):
62*2abb3134SXin Li    """Convert this instance to JSON.
63*2abb3134SXin Li
64*2abb3134SXin Li    The names are be compatible with the apps/api server.
65*2abb3134SXin Li    """
66*2abb3134SXin Li    return json.dumps({
67*2abb3134SXin Li        'numBits': self.num_bloombits,
68*2abb3134SXin Li        'numHashes': self.num_hashes,
69*2abb3134SXin Li        'numCohorts': self.num_cohorts,
70*2abb3134SXin Li        'probPrr': self.prob_f,
71*2abb3134SXin Li        'probIrr0': self.prob_p,
72*2abb3134SXin Li        'probIrr1': self.prob_q,
73*2abb3134SXin Li    })
74*2abb3134SXin Li
75*2abb3134SXin Li  # NOTE:
76*2abb3134SXin Li  # - from_csv is currently used in sum_bits.py
77*2abb3134SXin Li  # - to_csv is in rappor_sim.print_params
78*2abb3134SXin Li  @staticmethod
79*2abb3134SXin Li  def from_csv(f):
80*2abb3134SXin Li    """Read the RAPPOR parameters from a CSV file.
81*2abb3134SXin Li
82*2abb3134SXin Li    Args:
83*2abb3134SXin Li      f: file handle
84*2abb3134SXin Li
85*2abb3134SXin Li    Returns:
86*2abb3134SXin Li      Params instance.
87*2abb3134SXin Li
88*2abb3134SXin Li    Raises:
89*2abb3134SXin Li      rappor.Error: when the file is malformed.
90*2abb3134SXin Li    """
91*2abb3134SXin Li    c = csv.reader(f)
92*2abb3134SXin Li    ok = False
93*2abb3134SXin Li    p = Params()
94*2abb3134SXin Li    for i, row in enumerate(c):
95*2abb3134SXin Li
96*2abb3134SXin Li      if i == 0:
97*2abb3134SXin Li        if row != ['k', 'h', 'm', 'p', 'q', 'f']:
98*2abb3134SXin Li          raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row)
99*2abb3134SXin Li
100*2abb3134SXin Li      elif i == 1:
101*2abb3134SXin Li        try:
102*2abb3134SXin Li          # NOTE: May raise exceptions
103*2abb3134SXin Li          p.num_bloombits = int(row[0])
104*2abb3134SXin Li          p.num_hashes = int(row[1])
105*2abb3134SXin Li          p.num_cohorts = int(row[2])
106*2abb3134SXin Li          p.prob_p = float(row[3])
107*2abb3134SXin Li          p.prob_q = float(row[4])
108*2abb3134SXin Li          p.prob_f = float(row[5])
109*2abb3134SXin Li        except (ValueError, IndexError) as e:
110*2abb3134SXin Li          raise Error('Row is malformed: %s' % e)
111*2abb3134SXin Li        ok = True
112*2abb3134SXin Li
113*2abb3134SXin Li      else:
114*2abb3134SXin Li        raise Error('Params file should only have two rows')
115*2abb3134SXin Li
116*2abb3134SXin Li    if not ok:
117*2abb3134SXin Li      raise Error("Expected second row with params")
118*2abb3134SXin Li
119*2abb3134SXin Li    return p
120*2abb3134SXin Li
121*2abb3134SXin Li
122*2abb3134SXin Liclass _SecureRandom(object):
123*2abb3134SXin Li  """Returns an integer where each bit has probability p of being 1."""
124*2abb3134SXin Li
125*2abb3134SXin Li  def __init__(self, prob_one, num_bits):
126*2abb3134SXin Li    self.prob_one = prob_one
127*2abb3134SXin Li    self.num_bits = num_bits
128*2abb3134SXin Li
129*2abb3134SXin Li  def __call__(self):
130*2abb3134SXin Li    p = self.prob_one
131*2abb3134SXin Li    rand = SystemRandom()
132*2abb3134SXin Li    r = 0
133*2abb3134SXin Li
134*2abb3134SXin Li    for i in xrange(self.num_bits):
135*2abb3134SXin Li      bit = rand.random() < p
136*2abb3134SXin Li      r |= (bit << i)  # using bool as int
137*2abb3134SXin Li    return r
138*2abb3134SXin Li
139*2abb3134SXin Li
140*2abb3134SXin Liclass SecureIrrRand(object):
141*2abb3134SXin Li  """Python's os.random()"""
142*2abb3134SXin Li
143*2abb3134SXin Li  def __init__(self, params):
144*2abb3134SXin Li    """
145*2abb3134SXin Li    Args:
146*2abb3134SXin Li      params: rappor.Params
147*2abb3134SXin Li    """
148*2abb3134SXin Li    num_bits = params.num_bloombits
149*2abb3134SXin Li    # IRR probabilities
150*2abb3134SXin Li
151*2abb3134SXin Li    self.p_gen = _SecureRandom(params.prob_p, num_bits)
152*2abb3134SXin Li    self.q_gen = _SecureRandom(params.prob_q, num_bits)
153*2abb3134SXin Li
154*2abb3134SXin Li
155*2abb3134SXin Lidef to_big_endian(i):
156*2abb3134SXin Li  """Convert an integer to a 4 byte big endian string.  Used for hashing."""
157*2abb3134SXin Li  # https://docs.python.org/2/library/struct.html
158*2abb3134SXin Li  # - Big Endian (>) for consistent network byte order.
159*2abb3134SXin Li  # - L means 4 bytes when using >
160*2abb3134SXin Li  return struct.pack('>L', i)
161*2abb3134SXin Li
162*2abb3134SXin Li
163*2abb3134SXin Lidef get_bloom_bits(word, cohort, num_hashes, num_bloombits):
164*2abb3134SXin Li  """Return an array of bits to set in the bloom filter.
165*2abb3134SXin Li
166*2abb3134SXin Li  In the real report, we bitwise-OR them together.  In hash candidates, we put
167*2abb3134SXin Li  them in separate entries in the "map" matrix.
168*2abb3134SXin Li  """
169*2abb3134SXin Li  value = to_big_endian(cohort) + word  # Cohort is 4 byte prefix.
170*2abb3134SXin Li  md5 = hashlib.md5(value)
171*2abb3134SXin Li
172*2abb3134SXin Li  digest = md5.digest()
173*2abb3134SXin Li
174*2abb3134SXin Li  # Each has is a byte, which means we could have up to 256 bit Bloom filters.
175*2abb3134SXin Li  # There are 16 bytes in an MD5, in which case we can have up to 16 hash
176*2abb3134SXin Li  # functions per Bloom filter.
177*2abb3134SXin Li  if num_hashes > len(digest):
178*2abb3134SXin Li    raise RuntimeError("Can't have more than %d hashes" % md5)
179*2abb3134SXin Li
180*2abb3134SXin Li  #log('hash_input %r', value)
181*2abb3134SXin Li  #log('Cohort %d', cohort)
182*2abb3134SXin Li  #log('MD5 %s', md5.hexdigest())
183*2abb3134SXin Li
184*2abb3134SXin Li  return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)]
185*2abb3134SXin Li
186*2abb3134SXin Li
187*2abb3134SXin Lidef get_prr_masks(secret, word, prob_f, num_bits):
188*2abb3134SXin Li  h = hmac.new(secret, word, digestmod=hashlib.sha256)
189*2abb3134SXin Li  #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
190*2abb3134SXin Li
191*2abb3134SXin Li  # Now go through each byte
192*2abb3134SXin Li  digest_bytes = h.digest()
193*2abb3134SXin Li  assert len(digest_bytes) == 32
194*2abb3134SXin Li
195*2abb3134SXin Li  # Use 32 bits.  If we want 64 bits, it may be fine to generate another 32
196*2abb3134SXin Li  # bytes by repeated HMAC.  For arbitrary numbers of bytes it's probably
197*2abb3134SXin Li  # better to use the HMAC-DRBG algorithm.
198*2abb3134SXin Li  if num_bits > len(digest_bytes):
199*2abb3134SXin Li    raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d))
200*2abb3134SXin Li
201*2abb3134SXin Li  threshold128 = prob_f * 128
202*2abb3134SXin Li
203*2abb3134SXin Li  uniform = 0
204*2abb3134SXin Li  f_mask = 0
205*2abb3134SXin Li
206*2abb3134SXin Li  for i in xrange(num_bits):
207*2abb3134SXin Li    ch = digest_bytes[i]
208*2abb3134SXin Li    byte = ord(ch)
209*2abb3134SXin Li
210*2abb3134SXin Li    u_bit = byte & 0x01  # 1 bit of entropy
211*2abb3134SXin Li    uniform |= (u_bit << i)  # maybe set bit in mask
212*2abb3134SXin Li
213*2abb3134SXin Li    rand128 = byte >> 1  # 7 bits of entropy
214*2abb3134SXin Li    noise_bit = (rand128 < threshold128)
215*2abb3134SXin Li    f_mask |= (noise_bit << i)  # maybe set bit in mask
216*2abb3134SXin Li
217*2abb3134SXin Li  return uniform, f_mask
218*2abb3134SXin Li
219*2abb3134SXin Li
220*2abb3134SXin Lidef bit_string(irr, num_bloombits):
221*2abb3134SXin Li  """Like bin(), but uses leading zeroes, and no '0b'."""
222*2abb3134SXin Li  s = ''
223*2abb3134SXin Li  bits = []
224*2abb3134SXin Li  for bit_num in xrange(num_bloombits):
225*2abb3134SXin Li    if irr & (1 << bit_num):
226*2abb3134SXin Li      bits.append('1')
227*2abb3134SXin Li    else:
228*2abb3134SXin Li      bits.append('0')
229*2abb3134SXin Li  return ''.join(reversed(bits))
230*2abb3134SXin Li
231*2abb3134SXin Li
232*2abb3134SXin Liclass Encoder(object):
233*2abb3134SXin Li  """Obfuscates values for a given user using the RAPPOR privacy algorithm."""
234*2abb3134SXin Li
235*2abb3134SXin Li  def __init__(self, params, cohort, secret, irr_rand):
236*2abb3134SXin Li    """
237*2abb3134SXin Li    Args:
238*2abb3134SXin Li      params: RAPPOR Params() controlling privacy
239*2abb3134SXin Li      cohort: integer cohort, for Bloom hashing.
240*2abb3134SXin Li      secret: secret string, for the PRR to be a deterministic function of the
241*2abb3134SXin Li        reported value.
242*2abb3134SXin Li      irr_rand: IRR randomness interface.
243*2abb3134SXin Li    """
244*2abb3134SXin Li    # RAPPOR params.  NOTE: num_cohorts isn't used.  p and q are used by
245*2abb3134SXin Li    # irr_rand.
246*2abb3134SXin Li    self.params = params
247*2abb3134SXin Li    self.cohort = cohort  # associated: MD5
248*2abb3134SXin Li    self.secret = secret  # associated: HMAC-SHA256
249*2abb3134SXin Li    self.irr_rand = irr_rand  # p and q used
250*2abb3134SXin Li
251*2abb3134SXin Li  def _internal_encode_bits(self, bits):
252*2abb3134SXin Li    """Helper function for simulation / testing.
253*2abb3134SXin Li
254*2abb3134SXin Li    Returns:
255*2abb3134SXin Li      The PRR and IRR.  The PRR should never be sent over the network.
256*2abb3134SXin Li    """
257*2abb3134SXin Li    # Compute Permanent Randomized Response (PRR).
258*2abb3134SXin Li    uniform, f_mask = get_prr_masks(
259*2abb3134SXin Li        self.secret, to_big_endian(bits), self.params.prob_f,
260*2abb3134SXin Li        self.params.num_bloombits)
261*2abb3134SXin Li
262*2abb3134SXin Li    # Suppose bit i of the Bloom filter is B_i.  Then bit i of the PRR is
263*2abb3134SXin Li    # defined as:
264*2abb3134SXin Li    #
265*2abb3134SXin Li    # 1   with prob f/2
266*2abb3134SXin Li    # 0   with prob f/2
267*2abb3134SXin Li    # B_i with prob 1-f
268*2abb3134SXin Li
269*2abb3134SXin Li    # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with
270*2abb3134SXin Li    # probability f.  So in the expression below:
271*2abb3134SXin Li    #
272*2abb3134SXin Li    # - Bits in (uniform & f_mask) are 1 with probability f/2.
273*2abb3134SXin Li    # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability
274*2abb3134SXin Li    # f, so we get B_i with probability 1-f.
275*2abb3134SXin Li    # - The remaining bits are 0, with remaining probability f/2.
276*2abb3134SXin Li
277*2abb3134SXin Li    prr = (bits & ~f_mask) | (uniform & f_mask)
278*2abb3134SXin Li
279*2abb3134SXin Li    #log('U %s / F %s', bit_string(uniform, num_bits),
280*2abb3134SXin Li    #    bit_string(f_mask, num_bits))
281*2abb3134SXin Li
282*2abb3134SXin Li    #log('B %s / PRR %s', bit_string(bloom_bits, num_bits),
283*2abb3134SXin Li    #    bit_string(prr, num_bits))
284*2abb3134SXin Li
285*2abb3134SXin Li    # Compute Instantaneous Randomized Response (IRR).
286*2abb3134SXin Li    # If PRR bit is 0, IRR bit is 1 with probability p.
287*2abb3134SXin Li    # If PRR bit is 1, IRR bit is 1 with probability q.
288*2abb3134SXin Li    p_bits = self.irr_rand.p_gen()
289*2abb3134SXin Li    q_bits = self.irr_rand.q_gen()
290*2abb3134SXin Li
291*2abb3134SXin Li    irr = (p_bits & ~prr) | (q_bits & prr)
292*2abb3134SXin Li
293*2abb3134SXin Li    return prr, irr  # IRR is the rappor
294*2abb3134SXin Li
295*2abb3134SXin Li  def _internal_encode(self, word):
296*2abb3134SXin Li    """Helper function for simulation / testing.
297*2abb3134SXin Li
298*2abb3134SXin Li    Returns:
299*2abb3134SXin Li      The Bloom filter bits, PRR, and IRR.  The first two values should never
300*2abb3134SXin Li      be sent over the network.
301*2abb3134SXin Li    """
302*2abb3134SXin Li    bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes,
303*2abb3134SXin Li                                self.params.num_bloombits)
304*2abb3134SXin Li
305*2abb3134SXin Li    bloom = 0
306*2abb3134SXin Li    for bit_to_set in bloom_bits:
307*2abb3134SXin Li      bloom |= (1 << bit_to_set)
308*2abb3134SXin Li
309*2abb3134SXin Li    prr, irr = self._internal_encode_bits(bloom)
310*2abb3134SXin Li    return bloom, prr, irr
311*2abb3134SXin Li
312*2abb3134SXin Li  def encode_bits(self, bits):
313*2abb3134SXin Li    """Encode a string with RAPPOR.
314*2abb3134SXin Li
315*2abb3134SXin Li    Args:
316*2abb3134SXin Li      bits: An integer representing bits to encode.
317*2abb3134SXin Li
318*2abb3134SXin Li    Returns:
319*2abb3134SXin Li      An integer that is the IRR (Instantaneous Randomized Response).
320*2abb3134SXin Li    """
321*2abb3134SXin Li    _, irr = self._internal_encode_bits(bits)
322*2abb3134SXin Li    return irr
323*2abb3134SXin Li
324*2abb3134SXin Li  def encode(self, word):
325*2abb3134SXin Li    """Encode a string with RAPPOR.
326*2abb3134SXin Li
327*2abb3134SXin Li    Args:
328*2abb3134SXin Li      word: the string that should be privately transmitted.
329*2abb3134SXin Li
330*2abb3134SXin Li    Returns:
331*2abb3134SXin Li      An integer that is the IRR (Instantaneous Randomized Response).
332*2abb3134SXin Li    """
333*2abb3134SXin Li    _, _, irr = self._internal_encode(word)
334*2abb3134SXin Li    return irr
335