#!/usr/bin/perl

# Copyright (c) 2002-2020 Eduard Bloch <blade@debian.org>
#
# License: MIT (see COPYING file inside of the official package)

=pod

=head1 NAME

mail-expire - program to extract outdated messages from mbox files

=head1 SYNOPSIS

=over

=item B<mail-expire> I<AGE> I<FILES...>

=back

=head1 DESCRIPTION

B<mail-expire> is a small utility which only purpose is to help on keeping the
size of multiple mail folders as small as needed (by removing outdated
messages).

I<Maildir> and I<Mailbox> formats are supported for input, both types are mentioned
interchangeably in this manual page. Output goes to I<Mailbox> files.

The old messages are compressed with gzip or xz and stored in the file with
the name following the pattern I<MBOXNAME.MONTH_YEAR>.gz (or similar).  The
reference time for the output filename is calculated from the current time
minus number of days specified in the first parameter.

If a file of the same name is found, the default storage strategy is appending
a new compressed region to the existing file (unless B<-u> option is used).
This allows, for example, to run the expiration in a cron job each weekend,
while collecting all mails from (roughly) each month into dedicated buckets.
The drawback of this appending mode is a slightly reduced compression ratio,
the advantage is reduced disk usage all the time.
Also, such archives can later be recompressed separately to achive the highst
compression ratio (like: "mv x.xz x.old.xz && xzcat x.old.xz | xz -9 > x.xz &&
rm xz.old.xz").

=head2 SPACE REQUIREMENTS

For I<Mbox> input, additional disk space to store the amount of fresh messages plus a portion of the size of expired messages (depending on the output compression).

For I<Maildir> input, only additional space for a portion of the size of expired messages (depending on the output compression) is required.

=head1 OPTIONS

B<mail-expire> recognizes the following options:

=over

=item B<-u>

If an existing archive file with the expected filename is found, another filename is to be chosen. This happens by appending a suffix like (NUMBER) to the basename.

This is a legacy option, not compatible with any custom output command.

=item B<-t DIR>

Specifies a different target directory for storing expired mailbox files. Default is the current directory.

=item B<-T FILEBASE>

Explicit output target file name for all inputs (i.e. merging all expired mails into the same target mbox).
This mostly disables automated name generation (base name and date suffix),
only the compression suffix (-c option and related) is appended.

=item B<--dry-run>, B<-n> 

Only examine input files and directories, no mail data is moved around.

=item B<--delete>, B<-d>

Drop the old messages. No backup will be made, be careful with this option.

=item B<--keep>, B<-k>

Copy the expired messages to output files and keep them in the input file.

=item B<--purge=VALUE>, B<-p VALUE>

Specify how to remove I<input> files or folders after becoming empty. 0: never
remove, 1 (default): remove empty files but keep folders (also related
folders), 2: remove empty Mbox file and Maildir, 3: like 2 but also remove the
input file if it was empty.

=item B<-v LEVEL>

Verbosity level, where 0 produces no output except for errors, 1 some progress messages, 2 and more verbose progress messages.

=item B<-c output-shell-command>

A shell script fragment which is run in order to archive the data.
Default or current value can be displayed with -v2 option.

This command is very delicate, it might destroy the target file.
It might also be suspicious to shell injection attacks, quotes are important.

=item B<-X>

Shortcut for -c and -s options, compressing with XZ compressor (multiple
threads) and .xz file suffix.

=item B<-U>

Shortcut for -c and -s options, to not compress the output and not have an
additional suffix.

=item B<-l>

Lockless operation, input Mailboxes are assumed to be not touched by other
tools while I<mail-expire> runs. By default, it attempts to mimic the behavior
of I<procmail> to prevent it from file modifications. This option has no effect
on I<Maildir> type input. To be used with care, runtime conflicts with
I<procmail> or other tools B<MIGHT CAUSE DATA LOSS>.

=back

=head1 AUTHOR

This manual page was written by Eduard Bloch. Permission is granted to use,
copy, distribute and/or modify this document under the terms of the
I<MIT> license.

=cut

use strict;

use Getopt::Long qw(:config no_ignore_case bundling pass_through);
use Date::Parse;
use Date::Format;
use Fcntl;
use Mail::Mbox::MessageParser;
use File::Basename;

my $opt_u = 0;
my $opt_drop = 0;
my $opt_dryrun = 0;
my $opt_keep = 0;
my $verbosity = 1;
my $opt_tgtdir = undef;
my $opt_tgtfile = undef;
my $opt_cmd= '( test -x /usr/bin/pigz && /usr/bin/pigz -9 || gzip -9)';
my $opt_sfx = ".gz";
my $opt_lockless;

my $outStream; # current output stream
my $outFileName; # current output stream
my $origOutSize; # for rollback truncation

my $filename; # current input filename

my $now = time;
my $dateSfx;
my $opt_days; # threshold

sub main;
exit(main());


sub comp_close
{
    return 1 if !defined($outStream);

    my $ret = close($outStream);
    undef $outStream;
    return $ret;
}

sub make_outname {
    my $input = shift;
    my $datesuf = $dateSfx;
    # just the verbose output

    my $namebase;
    my $tgtdir = $opt_tgtdir ? "$opt_tgtdir/" : "";
    if (defined($opt_tgtfile))
    {
        $namebase = $opt_tgtfile;
    }
    else
    {
        if(defined($opt_tgtdir))
        {
            $namebase = "$opt_tgtdir/".basename($input);
        }
        else
        {
            $namebase = $input;
        }
    }

    my $ret = $namebase . (defined($opt_tgtfile) ? "" : ".$datesuf").$opt_sfx;
    return $ret if (defined($opt_tgtfile) || !$opt_u || !-s $ret);

    # legacy algorithm, make up some available name
    for(my $modnumber = 0; ; ++$modnumber)
    {
        my $testloc = "$namebase." . $dateSfx . "~$modnumber" . $opt_sfx;
        return $testloc if (-s $testloc);
    }
    die "No spare name could be found for $input (path too long?)";
}

# Open current output.
#
# Returns true if succeeded or already open, terminates on fatal errors.
sub open_target
{
    if($outStream)
    {
        return 1;
    }
    else
    {
        $outFileName = make_outname($filename);
        print "OO: FILENAME = $outFileName\n" if $verbosity > 1;
    }

    return 1 if ($opt_drop || $opt_dryrun);

    $ENV{"FILENAME"} = $outFileName;
    my @sb = stat($outFileName);
    $origOutSize = @sb ? $sb[7] : -1;

    if (! open($outStream, "|-", 'sh', '-c', $opt_cmd.' >> "$FILENAME"' ))
    {
        print STDERR "E: cannot run output delivery command: $opt_cmd\n";
        return 0;
    }
    return 1;
}


sub comp_write
{
    my $data = shift;

    return 1 if ($opt_drop || $opt_dryrun);
    open_target or return 0;

    print $outStream $data or return 0;
    return 1;
}

# analyze relevant date field
# Return: true if fresh
sub is_fresh
{
    my $emailref = shift;
    my $dateHintRef = shift;

    my @date_re = (
        '^(From)\s\S+\s+(.*)',
        '(^|\n)Delivery-date:\s+(\w.*)\s*$',
        '(^|\n)Date:\s+(\w.*)\s*$'
    );
    my $age_days = undef;

    foreach my $re (@date_re)
    {
        $$emailref =~ /$re/m or next;
        my $sDate = $2;
        next if !$sDate;
        my $mdate = str2time($sDate);

        print "DBG: $sDate via $re, parsed: $mdate\n" if $verbosity > 3;
        next if !$mdate;

        my $diff = int(($now - $mdate) / 86400);
        next if $diff < 0; # clock skew?

        if (!defined($age_days) || $diff < $age_days) 
        {
            $age_days = $diff;
            $$dateHintRef = $mdate if defined($dateHintRef);
        }
    }
    $age_days = 0 if !defined($age_days);
    if ($age_days <= $opt_days)
    {
        syswrite(STDOUT, "$age_days (new), ") if $verbosity > 2;
        return 1;
    }
    syswrite(STDOUT, "$age_days (old), ") if $verbosity > 2;
    return 0;
}

# process input from mbox data
# Returns number of expired entries, negative value on error
sub process_mbox
{
    my $inFD;
    my $tempFileName = shift;
    my $old_count=0;
    my $new_count=0;

    if (!open($inFD,$filename)) {
        syswrite(STDOUT,"$filename could not be opened, skipping");
        return;
    };
    if(flock($inFD,2|4)){
        # lock when not locked already by another process
        flock($inFD,2) || die "unexpected trouble on locking $filename";
    } else {
        # skip file
        close($inFD);
        syswrite(STDOUT,"$filename is locked by an other prozess, skipping.");
        return;
    };
    my $file_handle = new FileHandle($filename);
    my $folder_reader = new Mail::Mbox::MessageParser( {
            'file_name' => $filename,
            'file_handle' => $file_handle,
            'enable_cache' => 0
        } );

     #die "he? ".ref($folder_reader);
    #die ref($folder_reader);
    if (ref($folder_reader) ne "Mail::Mbox::MessageParser::Grep" && ref($folder_reader) ne "Mail::Mbox::MessageParser::Perl" ) {
        syswrite STDERR, "Unable to parse contents of $filename, skipping.\n" if $verbosity;
        return;
    }

    my $tempFD;
    sysopen($tempFD, $tempFileName, O_RDWR|O_EXCL|O_CREAT) || die "Error creating temporary file, move $tempFileName out of the way";
    while(!$folder_reader->end_of_file())
    {
        my $email = $folder_reader->read_next_email();
        print STDOUT length($$email)."\n" if ($email && $verbosity > 4);
        if (!is_fresh($email))
        {
            $old_count++;
            comp_write($$email) or return;
        }
        else
        {
            $new_count++;
            next if $opt_dryrun;

            next if defined(syswrite($tempFD, $$email));

            die "Failure while writting - disc full? Please resolve and remove $tempFileName";
        }
    }

    flock($inFD, 8);
    close($inFD);
    close($tempFD);

    if( ($old_count + $new_count) < 1) {
        syswrite STDOUT, "No changes, ignoring file\n" if $verbosity > 1;
        return;
    }

    return ($old_count, $new_count);
}

sub process_mdir
{
    my $refItems = shift;
    my $refRetire = shift;

    # XXX: this partly has overlapping meaning with the return array
    my $old_count = 0;
    my $new_count = 0;
    
    #print @$refItems;
    #die;
    foreach my $item (@$refItems)
    {
        if (-z $item)
        {
            $old_count++;
            push(@$refRetire, $item);
            next;
        }

        my $data;
        {
            open my $fh, '<', $item or next;
            local $/ = undef;
            $data = <$fh>;
            close $fh or next;
        }

        my $finDate = undef;
        if (is_fresh(\$data, \$finDate))
        {
            $new_count++;
            next;
        }

        $old_count++;
        push(@$refRetire, $item);

        $finDate = ctime($finDate);

        if ($data =~ /Return-path:\s+<([^>]+)>/m)
        {
            comp_write("From $1 $finDate") or return;
        }
        else
        {
            comp_write("From unknown\@unknown.unknown $finDate") or return;
        }
        comp_write($data) or return;
        comp_write("\n\n") or return;
    }

    return ($old_count, $new_count);
}

sub set_mail_lockfile
{
    return 1 if $opt_lockless;
    my $lockfile = shift;
    my $tmp = $lockfile.rand();
    open(nix,">$tmp");
    print nix "0";
    close nix or return 0;
    while(1)
    {
        if(link($tmp, $lockfile))
        {
            unlink $tmp;
            return 1;
        }
        print "W: found a lockfile for the mailbox ($lockfile), waiting for its release (removal)\n";
        sleep 1;
    }
}

sub help {
    my $msg = "Usage: $0 [ options ] DAYS FILES
where
DAYS is an integer specifying the maximum age of a mail in days and
FILES one or more mbox file(s).

Options:
 -u, --unique\tchoose different filenames if the target file already exists
 -d, --delete\tdrops the old messages. Be warned, no backup will be made!
 -n\t\tDry run, no mail is moved
 -k, --keep\tOld mails are extracted but not removed
 -t DIR\t\tnew target directory DIR
 -v LEVEL\tVerbosity level, 0..5, default: 1 (some messages)
 -c CMD\t\tOutput filter command (default: pick pigz or gzip)
 -s SFX\t\tOutput file suffix (default: .gz)
 -X\t\tMacro, presets output to use XZ compression.
 -U\t\tMacro, presets output to not use compression
 -l\t\tLockfree reading, input must not be changed inbetween!
 -h, --help\tThis help output
";
    die $msg if shift;
    print $msg;
    exit;
}

sub try_del_md_if_empty
{
    return if ! -d $filename;
    print shift if $verbosity;
    rmdir "$filename/cur";
    rmdir "$filename/tmp";
    rmdir "$filename/new";
    # should have succeeded if this all went as expected. Otherwise, restore
    # the maildir parts
    return if rmdir $filename;
    
    print "W: $filename not empty or IO error, cannot purge, restoring old state!\n" if $verbosity;
    mkdir "$filename/tmp";
    mkdir "$filename/cur";
    mkdir "$filename/new";
}

#### MAIN CODE ####

sub main
{
    my $opt_xz;
    my $opt_unc;
    my $opt_purge = 1;
    my $help;

    my %opts = (
        "t=s", \$opt_tgtdir, 
        "T=s", \$opt_tgtfile, 
        "d|delete", \$opt_drop,
        "help|h", \$help,
        "u|unique", \$opt_u,
        "v=i", \$verbosity,
        "c", \$opt_cmd,
        "s", \$opt_sfx,
        "X", \$opt_xz,
        "U", \$opt_unc,
        "l", \$opt_lockless,
        "n|dry-run", \$opt_dryrun,
        "k|keep", \$opt_keep,
        "p|purge=i", \$opt_purge
    );

    &help(1) if !GetOptions(%opts);
    &help(0) if $help;

    if ($opt_xz)
    {
        $opt_cmd = "xz -T0";
        $opt_sfx = ".xz";
    }
    elsif ($opt_unc)
    {
        $opt_cmd = "cat";
        $opt_sfx = "";
    }

    $opt_days = shift(@ARGV);
    die "Please specify a valid day count!\n" if abs($opt_days)<1;
    die "Please specify mbox file names!\n" if ! @ARGV;

    $dateSfx = time2str("%Y_%m", $now - $opt_days * 86400);

    print "RUNNING IN SIMULATION MODE (DRY-RUN)!\n" if ($opt_dryrun && $verbosity);

    if ($verbosity > 1) {
        print "OUTPUT PATTERN: ".make_outname("<FILE>")."\n";
        print "OUTPUT COMMAND: $opt_cmd\n";
    }
    my $lockfile;

    JOB: 
    foreach my $inarg (@ARGV) {
        my @st;
        my @time;

        $filename = $inarg;
        $filename =~ s,/+$,,;

        # safety cleanup
        comp_close();
        unlink $lockfile if $lockfile;

        my @mditems;
        my @procRes;
        my @ancient;

        push @mditems, <$filename/cur/*>;
        push @mditems, <$filename/new/*>;
        push @mditems, <$filename/tmp/*>;
        my $tempFileName = $filename."_~".rand;

        print "I: Processing $filename:\n" if $verbosity;
        print "MD items: ".@mditems."\n" if ($verbosity > 4);

        if (0 == @mditems && $opt_purge > 2)
        {
            try_del_md_if_empty("I: Dumping empty input Maildir $filename\n");
            next JOB;
        }

        my $isMdir = (-d "$filename/cur" || -d "$filename/new" || -d "$filename/tmp");

        if ($isMdir)
        {
            @procRes = process_mdir(\@mditems, \@ancient);
        }
        else
        {
            $lockfile = $filename.".lock";
            set_mail_lockfile($lockfile) or die "Error creating lockfiles in the source directory - consider -l option";

            if(-z $filename) {
                if (!$opt_keep && !$opt_dryrun && $opt_purge > 2)
                {
                    syswrite(STDERR, "Removing empty input file $filename.\n") if $verbosity;
                    unlink $filename;
                }
                else
                {
                    syswrite(STDERR, "Empty file $filename, skipping.\n") if $verbosity;
                }
                next JOB;
            };
            if(-e $tempFileName)
            { 
                syswrite(STDERR, "ERROR: Temporary file $tempFileName already exists, skipping $filename.\n");
                next JOB;
            };
            @procRes = process_mbox($tempFileName);
        }

        my $outState = comp_close();
        my $toWhere = " ($outFileName)" if $outFileName;

        my $old_count = @procRes ? $procRes[0] : -1;
        my $new_count = @procRes ? $procRes[1] : "MANY";

        if (@procRes && $verbosity)
        {
            print "I: $procRes[1] fresh, $old_count expired$toWhere\n";
        }
        
        # nothing extracted, or error, or error on storing?
        if ($old_count <= 0 || !$outState)
        {
            unlink $tempFileName if $tempFileName;
            truncate($outFileName, $origOutSize) if (!$opt_dryrun && !$opt_drop && defined($outFileName) && -e $outFileName && $origOutSize > 0);
            next JOB;
        }
        syswrite (STDOUT,"\nI: Copied $old_count expired entries$toWhere\n") if $verbosity > 1;

        # ok, some where extracted; keep only the fresh messages

        if($isMdir)
        {
            die "ERROR: old messages not reported but counted as $old_count" if ! @ancient;
            print "Removing ".scalar @ancient." old messages from $filename\n" if $verbosity > 1;
            if (!$opt_keep && !$opt_dryrun)
            {
                unlink @ancient;
                try_del_md_if_empty("I: Dumping (now empty) input Maildir $filename\n") if ($opt_purge >= 2 && $new_count == 0);
            }
        }
        else
        {
            my $tempname = $filename."_T".rand();
            print "\nI: Reinstating $filename from $tempFileName via $tempname\n" if $verbosity > 2;
            if (!$opt_keep && !$opt_dryrun)
            {
                if($new_count == 0 && $opt_purge > 0)
                {
                    print "Removing (now empty) input $filename\n" if $verbosity > 1;
                    unlink $filename;
                    next JOB;
                }

                rename($filename, $tempname) || die "Cannot rename old mbox to temp file $tempname";
                rename($tempFileName, $filename) || die "Cannot move $tempname into old location";
                unlink($tempname);
            }
        }
    }

    unlink $lockfile if $lockfile;

    return 0;
}

# vim: set sw=4 tw=78 nocin expandtab:
