#!/usr/bin/env perl

## Name: extrcat.pl
## Autor: Jens Steube
## Version: 1.01
##
## http://hashcat.net/tools/experiments/extrcat/

##
## configuration
##

my $reference_file = "rockyou-top50k.txt";

my $show_max = 1000;

my $avg_multi = 4;

my $password_max = 16;

##
## modules
##

use strict;
use warnings;

##
## globals
##

my $reference_db = {};
my $reference_db_total = 0;

my $target_db = {};
my $target_db_total = 0;

##
## check parameters
##

if ((scalar @ARGV != 1) && (scalar @ARGV != 2))
{
  die "usage: $0 infile [threshold]\n";
}

my $target_file = $ARGV[0];
my $threshold   = $ARGV[1] || $avg_multi;

##
## load reference file
##

if (! open (IN, $reference_file))
{
  print STDERR "Error: could not open '$reference_file'\n";

  exit (1);
}

while (my $line = <IN>)
{
  $line =~ s/\n$//;
  $line =~ s/\r$//;

  next if ($line =~ m/^[0-9]*$/); # only numbers
  next if ($line =~ m/^(.)\1+$/); # all chars the same

  if ($line =~ m/^\$HEX\[[0-9a-f]+\]$/)
  {
    if ((length ($line) % 2) == 0)
    {
      $line = pack ("H*", substr ($line, 5, length ($line) - 5 - 1));

      next if ($line =~ m/[\r\n]/);
    }
  }

  next if (length ($line) > $password_max);

  $reference_db_total += add_to_db ($line, length $line, $reference_db);
}

close (IN);

##
## load target file
##

if (! open (IN, $target_file))
{
  print STDERR "Error: could not open '$target_file'\n";

  exit (1);
}


while (my $line = <IN>)
{
  $line =~ s/\n$//;
  $line =~ s/\r$//;

  next if ($line =~ m/^[0-9]*$/); # only numbers
  next if ($line =~ m/^(.)\1+$/); # all chars the same

  if ($line =~ m/^\$HEX\[[0-9a-f]+\]$/)
  {
    if ((length ($line) % 2) == 0)
    {
      $line = pack ("H*", substr ($line, 5, length ($line) - 5 - 1));

      next if ($line =~ m/[\r\n]/);
    }
  }

  next if (length ($line) > $password_max);

  $target_db_total += add_to_db ($line, length $line, $target_db);
}

close (IN);

##
## pre process
##

my $pre_final_db;

for my $sub (sort { $target_db->{$b}->{"count"} <=> $target_db->{$a}->{"count"} } keys %{$target_db})
{
  my $target_count = $target_db->{$sub}->{"count"};

  next unless $target_count > 1;

  my $target_percent = ($target_count / $target_db_total) * 100;

  my $reference_percent = 0;

  if (exists $reference_db->{$sub})
  {
    my $reference_count = $reference_db->{$sub}->{"count"};

    $reference_percent = ($reference_count / $reference_db_total) * 100;
  }

  my $diff = $target_percent - $reference_percent;

  next if $diff <= 0;

  my $entry =
  {
    "count"   => $target_count,
    "string"  => $sub,
    "length"  => length $sub,
    "diff"    => $diff,
  };

  $pre_final_db->{$sub} = $entry;
}

undef $reference_db;
undef $target_db;

##
## throw away partial matching substrings, then build averages but only from longest substrings
##

my @final_db;

my @pre_final_keys = sort { $pre_final_db->{$b}->{"length"} <=> $pre_final_db->{$a}->{"length"} ||
                            $pre_final_db->{$b}->{"count"}  <=> $pre_final_db->{$a}->{"count"}  ||
                            $a cmp $b } keys %{$pre_final_db};

for (my $i = 0; $i < scalar @pre_final_keys; $i++)
{
  next if (is_partial ($i, \@pre_final_keys, $pre_final_db));

  my $sub = $pre_final_keys[$i];

  my $entry = $pre_final_db->{$sub};

  push (@final_db, $entry);

  last if ($show_max-- == 1);
}

##
## calculate averages
##

my $total_diff  = 0;
my $total_count = 0;

for my $entry (@final_db)
{
  $total_diff   += $entry->{"diff"};
  $total_count  += $entry->{"count"};
}

if (scalar (@final_db) < 1)
{
  exit (1);
}

my $avg_diff  = $total_diff  / scalar @final_db;
my $avg_count = $total_count / scalar @final_db;

##
## print
##

for my $entry (sort { $b->{"count"} <=> $a->{"count"} || $b->{"diff"} <=> $a->{"diff"} } @final_db)
{
  next unless $entry->{"diff"}  >= $avg_diff  * $threshold;
  next unless $entry->{"count"} >= $avg_count * $threshold;

  printf "%-32s count:%8d diff:%.6f\n",
    $entry->{"string"},
    $entry->{"count"},
    $entry->{"diff"};
}

exit (0);

##
## subs
##

sub add_to_db
{
  my $line = shift;
  my $len  = shift;
  my $db   = shift;

  my $line_lc = lc $line;

  return 0 if all_same_chars ($line_lc);

  return 0 unless all_ascii_chars ($line_lc);

  my $cnt = 0;

  for (my $i = 0; $i < $len; $i++)
  {
    for (my $j = 0; $j < $len - $i; $j++)
    {
      my $sub = substr ($line_lc, $i, $j + 1);

      $db->{$sub}->{"count"}++;

      $cnt++;
    }
  }

  return $cnt;
}

sub is_partial
{
  my $pos = shift;

  my $pre_final_keys_ref = shift;

  my $pre_final_db = shift;

  my $target_sub = $pre_final_keys_ref->[$pos];

  my $target_entry = $pre_final_db->{$target_sub};

  for (my $i = 0; $i < $pos; $i++)
  {
    my $prev_sub = $pre_final_keys_ref->[$i];

    my $prev_entry = $pre_final_db->{$prev_sub};

    next if (index ($prev_entry->{"string"}, $target_entry->{"string"}) == -1);

    return 1 if $prev_entry->{"diff"} == $target_entry->{"diff"};
  }

  return 0;
}

sub all_same_chars
{
  my $in = shift;

  my @word = split "", $in;

  my $in0 = shift @word;

  while (my $next = shift @word)
  {
    return 0 unless ($next eq $in0);
  }

  return 1;
}

sub all_ascii_chars
{
  my $in = shift;

  my @word = split "", $in;

  for (my $i = 0; $i < scalar @word; $i++)
  {
    return 0 if (ord ($word[$i]) <= 0x20);
    return 0 if (ord ($word[$i]) >= 0x80);
  }

  return 1;
}
