|
The output set can be very large
|
in an input stream using a one-pass algorithm (i.e., without storing all the values in the stream)
The natural step is to relax the condition:
|
|
|
|
|
Warning:
|
|
|
|
|
/* -----------------------------------------------------
Initialization:
----------------------------------------------------- */
D = empty; // Empty list...
bcurrent = 1; // First current bucket
N = 0; // Number of items processed
/* ---------------------------------------------------
Main processing loop
--------------------------------------------------- */
while ( not end of stream ) do
{
x = next item in stream;
N = N + 1; // One more item processed
/* --------------------------------
Insert phase
-------------------------------- */
if ( x ∈ D )
{
fx++; // Increase its count
}
else
{
insert (x, 1, bcurrent-1) into D;
// Add x to D with frequency count = 1
// The maximum error Δ is set to (bcurrent- 1)
}
/* -----------------------------------------------------------
Delete phase: Space Reduction step...
Note: this step is executed once every w insertions
I.e., when one bucket fills up !
----------------------------------------------------------- */
if ( N mod w == 0 )
{ // Bucket boundary reached, cleanup the infrequent items !!
for ( each element i ∈ D ) do
{
if ( fi + Δi ≤ bcurrent )
delete (i, fi, Δi) from D;
}
bcurrent++; // Start a new bucket...
}
}
/* ---------------------------------------------------
Output phase
--------------------------------------------------- */
for ( each element i ∈ D ) do
{
if ( fi ≥ (s - ε) × N )
{
Print i, fi
}
}
|
(The parameter s is not used until the end of the algorithm.) ε = 0.2, w = 1/ε = 5 (5 items per "bucket") |
|
|
|
|
|
Proof: by induction
|
|
Induction hypothesis: assume the statement is true for all i < k
|
We need to prove that:
|
Proof:
|
|
Proof:
|
|
That's what we needed to prove...
|
|
The following lemma tells us how accurate the approximate frequency f is.
|
Proof:
|
Part 2: fe ≤ f + ε × N
|
where s is a set of items.
D = empty; // Empty list...
bcurrent = 1;
N = 0; // Number of items processed
while (NOT EOF) do
{
x = next item SET in stream;
N = N + 1; // One more item SET processed
// Tally step...
if ( x ∈ D )
{
for each (si, fi, Δi) ∈ D do
{
if ( si ⊂ x )
fi++; // Found, increase its count
}
}
else
{
insert (x, 1, bcurrent-1) into D;
// New element has frequency count 1
// and maximum error Δ = bcurrent- 1
}
// Space Reduction step...
if ( N mod w == 0 )
{ // Bucket boundary reached, cleanup the infrequent items !!
for each (si, fi, Δi) ∈ D do
{
if ( fi + Δi ≤ bcurrent )
delete (si, fi, Δi) from D;
}
bcurrent++; // Next bucket...
}
}
Output all entries with fi ≥ (s - ε) × N
|