Photos, ramblings, whatever

Wednesday, November 25, 2009

evolution of spam filters

One of the many projects I'm coding right now is a spam filter. Well it's not just a spam filter: I'm pretty sure I'll reuse parts of it in some pretty diverse other things. Without further ado I'll share the prototype (in Vala)...
using GLib;

namespace tta {
public static delegate void sexecF(short value, void* target);
public struct setWordFunc {
sexecF exec;
void* target;
}
public static delegate short gexecF(void * target);
public struct getWordFunc {
gexecF exec;
void * target;
}

public class VM : GLib.Object {
private setWordFunc[] outputMaps;
private getWordFunc[] inputMaps;
private short[] bytes;
private int pc = 1;
private bool state = false;
public int timeout { get; set; }

public VM(int size) {
bytes = new short[size];
outputMaps = new setWordFunc[size];
inputMaps = new getWordFunc[size];
timeout = -1; // if negative, leave alone. If positive, decriment each cycle until 0,
// then halt.
}

public VM.array(short[] ram) {
bytes = ram;
outputMaps = new setWordFunc[bytes.length];
inputMaps = new getWordFunc[bytes.length];
}

public void mapInput(getWordFunc input, int address) {
inputMaps[address % inputMaps.length] = input;
}

public void mapOutput(setWordFunc output, int address) {
outputMaps[address % outputMaps.length] = output;
}

public short getWord(int address) {
int a = address % bytes.length;
a = a < b =" address" b =" b" pc =" value" state =" false;" pc =" 1;" state =" true;" state ="=" i =" getWord(pc);">> 8) & 0x00FF, i & 0x00FF);
pc = (pc + 1) % bytes.length;
if(timeout > 0) {
timeout--;
}
}
}

public void execute(int src, int dest) {
setWord(dest, getWord(src));
}
}

public class ALU : GLib.Object {
private short a = 0;
private short b = 0;

public static void setOperandA(short value, void* target) {
((ALU)target).a = value;
}

public static void setOperandB(short value, void* target) {
((ALU)target).b = value;
}

public static short add(void* target) {
var x = (ALU)target;
return x.a + x.b;
}

public static short subtract(void* target) {
var x = (ALU)target;
return x.a + x.b;
}

public static short multiply(void* target) {
var x = (ALU)target;
return x.a * x.b;
}

public static short modulus(void* target) {
var x = (ALU)target;
return x.a % x.b;
}

public static short divide(void * target) {
var x = (ALU)target;
return ( x.b == 0 ) ? 0 : ( x.a / x.b);
}

public static short gt(void* target) {
var x = (ALU)target;
return (short)(x.b > x.a);
}

public static short eq(void* target) {
var x = (ALU)target;
return (short)(x.b == x.a);
}
}

}

namespace evospam {
class Bot : GLib.Object {
public int score { get; set; }
private short[] dna = new short[512];

public Bot() {
for(int i = 0; i < i =" 0;"> 0.1) {
dna[i] = parent.dna[i];
} else {
dna[i] = (short)(GLib.Random.next_int() % short.MAX);
}
}
}

private static struct retParam {
short* retptr;
tta.VM haltTgt;
}
public static void ret(short value, void* target) {
retParam* a = (retParam*)target;
*(a->retptr) = value;
tta.VM.halt(value, a->haltTgt);
}

public static short readStream(void* target) {
unowned GLib.FileStream io = (GLib.FileStream)target;
if(!io.eof()) {
return (short)io.getc();
} else {
return 0;
}
}

public short run(GLib.FileStream message) {
var tmp = new short[512];
var alu = new tta.ALU();
GLib.Memory.copy(tmp, dna, 1023);
var vm = new tta.VM.array(tmp);
vm.mapInput({alu.add, alu}, 506);
vm.mapInput({alu.subtract, alu}, 507);
vm.mapInput({alu.multiply, alu}, 508);
vm.mapInput({alu.divide, alu}, 509);
vm.mapInput({alu.gt, alu}, 510);
vm.mapInput({alu.eq, alu}, 511);
short r = 0;
retParam xt = {&r, vm};
vm.mapOutput({ret, &xt}, 0);
vm.mapInput({readStream, message}, 0);
vm.timeout = 4000000;
vm.run();
return r;
}
}
}

int evaluate(evospam.Bot b) {
try{
GLib.Dir ham = GLib.Dir.open("ham");
GLib.Dir spam = GLib.Dir.open("spam");
int[] score = {0, 0};
int finalscore;
for(int c = 0; c < cd =" c" filename =" cd.read_name();" guess =" b.run(FileStream.open(" c ="=" s =" guess" s =" s" filename =" cd.read_name();" finalscore =" score[0]" score =" finalscore;" bots =" new" top_score =" 0;" i =" 0;" score =" evaluate(bots[i]);" top_score =" score"> top_score ? score : top_score;
}
stdout.printf("Dummy run top score = %d\n", top_score);
int counter = 0;
int overallCounter = 0;
while(top_score < lowest =" 0;" lowscore =" bots[0].score;" i =" 1;" score ="=" lowscore =" bots[i].score;" lowest =" i;">= 300) {
stdout.printf("...(lowest this round: %d)\n", lowscore);
counter = 0;
}
if(top_score == 0) {
bots[lowest] = new evospam.Bot();
} else {
bots[lowest] = new evospam.Bot.mutate(bots[GLib.Random.next_int() % 300]);
}
lowscore = evaluate(bots[lowest]);

if(lowscore > top_score) {
stdout.printf("Highscore: %d\n", lowscore);
top_score = lowscore;
counter = 0;
}
counter++;
overallCounter++;
}
stdout.printf("Generations after initialization: %d\n", overallCounter);
return 0;
}




And here is the output from a fairly typical run:

Dummy run top score = 0
Highscore: 3
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
Highscore: 4
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
Highscore: 7
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 1)
...(lowest this round: 0)
...(lowest this round: 0)
Highscore: 8
...(lowest this round: 0)
...(lowest this round: 0)
Highscore: 13
...(lowest this round: 3)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 2)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
Highscore: 16
...(lowest this round: 4)
...(lowest this round: 4)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 7)
...(lowest this round: 7)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 10)
...(lowest this round: 0)
...(lowest this round: 10)
Highscore: 17
...(lowest this round: 9)
...(lowest this round: 10)
Highscore: 18
...(lowest this round: 10)
...(lowest this round: 0)
...(lowest this round: 3)
...(lowest this round: 13)
...(lowest this round: 13)
Highscore: 19
...(lowest this round: 10)
...(lowest this round: 0)
...(lowest this round: 0)
...(lowest this round: 15)
...(lowest this round: 16)
...(lowest this round: 3)
...(lowest this round: 13)
Highscore: 23
...(lowest this round: 0)
...(lowest this round: 15)
...(lowest this round: 0)
...(lowest this round: 10)




There is a possibility I will post this without waiting for the program to complete.

Before I go, I'll discuss briefly how it works and some of its features. The code in the tta namespace is actually a computer in its own right, although one which only exists in software and the mind of anyone reading the code. Its first unusual property is that it halts as soon as it outputs anything. That's because it's only being run in order to produce one number (which is then used to decide whether it thinks the input is a spam message). It's also rather unusual as a computer in that it only operates on 16-bit words, there are no bitwise operations and no pure shift operators. In spite of these limitations it's actually Turing-complete, and if the restriction of one output only is removed it can be used to compute anything that will fit in its registers (or anything at all if programmed to use external memory).
Its other unusual feature is that it only has one operation, so there is no need to waste any bits specifying which one is to be used for each instruction! The instruction is "copy". It's 16 bits wide, the first 8 bits are the register ID for the original, and the remaining 8 bits are the register ID for the copy destination. Most registers simply store the data, but one is used for reading the file and returning the evaluation, and some others are used to control the arithmetic logic unit. This kind of system is called a transport triggered architecture. I thought of it before I read about it, but I read about it before I started coding this one.

Enough about that: the evolutionary algorithm is pretty standard. It basically works by continually throwing out whichever candidate program appears to be the worst solution and replacing it with a random variation of a randomly selected superior. Some of the new random variations don't work at all, so they're replaced almost at once. Some work better than any of the others, and have a tendency to fill the entire population with their descendants until something even better comes along.

The fitness function used here is actually the most original part of the algorithm. It works as follows: count the number of real emails that it classifies as real email. Then count the number of spam emails it classifies as spam. The score is the lowest number of these. My first approach was to simply count the number of correctly classified emails, but that worked rather poorly because the programs that classified either everything as spam or everything as real email had a huge advantage over the ones that actually read the email.

As for the final performance: the best score I've seen so far is 25. The samples it's working with are 32 emails each of spam and genuine, so that 25 score has a minimum of 78% accuracy. The overall accuracy is probably better because the score is only based on the aspect of the task it performs worst at.
Admittedly: the trials so far only measure performance in classifying 64 messages. For all I know it will fall flat in a real-life situation, but the only way to find that out is to test it.

Labels:

0 Comments:

Post a Comment

Subscribe to Post Comments [Atom]

<< Home