many changes ;-)

Former-commit-id: cb4aea844e960e4af4dc673ebc8eec49a7d12b13
This commit is contained in:
2023-12-05 15:28:29 +01:00
parent 03bef6461d
commit 0f8066d367
39 changed files with 951797 additions and 198 deletions

198
Example_Arth03.ngsfilter Normal file
View File

@ -0,0 +1,198 @@
CIRA_Arth03 CIRA001_A acacacac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01A;
CIRA_Arth03 CIRA002_A acagcaca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01B;
CIRA_Arth03 CIRA003_A gtgtacat:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01C;
CIRA_Arth03 CIRA004_A tatgtcag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01D;
CIRA_Arth03 CIRA005_A tagtcgca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01E;
CIRA_Arth03 CIRA006_A tactatac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01F;
CIRA_Arth03 CIRA007_A actagatc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01G;
CIRA_Arth03 BLNK001 gatcgcga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01H;
CIRA_Arth03 CIRA008_A acacacac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02A;
CIRA_Arth03 CIRA009_A acagcaca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02B;
CIRA_Arth03 CIRA010_A gtgtacat:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02C;
CIRA_Arth03 CPCR01_A tatgtcag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02D;
CIRA_Arth03 CIRA011_A tagtcgca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02E;
CIRA_Arth03 CIRA012_A tactatac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02F;
CIRA_Arth03 BLNK002 actagatc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02G;
CIRA_Arth03 CIRA013_A gatcgcga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02H;
CIRA_Arth03 CIRA014_A acacacac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03A;
CIRA_Arth03 CIRA015_A acagcaca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03B;
CIRA_Arth03 CIRA016_A gtgtacat:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03C;
CIRA_Arth03 CIRA017_A tatgtcag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03D;
CIRA_Arth03 CIRA018_A tagtcgca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03E;
CIRA_Arth03 BLNK003 tactatac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03F;
CIRA_Arth03 CIRA019_A actagatc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03G;
CIRA_Arth03 CIRA020_A gatcgcga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03H;
CIRA_Arth03 CIRA021_A acacacac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04A;
CIRA_Arth03 CIRA022_A acagcaca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04B;
CIRA_Arth03 CIRA023_A gtgtacat:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04C;
CIRA_Arth03 CIRA024_A tatgtcag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04D;
CIRA_Arth03 BLNK004 tagtcgca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04E;
CIRA_Arth03 CIRA025_A tactatac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04F;
CIRA_Arth03 CIRA026_A actagatc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04G;
CIRA_Arth03 CIRA027_A gatcgcga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04H;
CIRA_Arth03 CPOS232_A acacacac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05A;
CIRA_Arth03 CIRA028_A acagcaca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05B;
CIRA_Arth03 CIRA029_A gtgtacat:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05C;
CIRA_Arth03 BLNK005 tatgtcag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05D;
CIRA_Arth03 CIRA030_A tagtcgca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05E;
CIRA_Arth03 CIRA031_A tactatac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05F;
CIRA_Arth03 CIRA032_A actagatc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05G;
CIRA_Arth03 CIRA033_A gatcgcga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05H;
CIRA_Arth03 CIRA034_A acacacac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06A;
CIRA_Arth03 CIRA035_A acagcaca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06B;
CIRA_Arth03 BLNK006 gtgtacat:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06C;
CIRA_Arth03 CIRA036_A tatgtcag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06D;
CIRA_Arth03 CIRA037_A tagtcgca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06E;
CIRA_Arth03 CIRA038_A tactatac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06F;
CIRA_Arth03 CIRA039_A actagatc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06G;
CIRA_Arth03 CIRA040_A gatcgcga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06H;
CIRA_Arth03 CIRA041_A acacacac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07A;
CIRA_Arth03 BLNK007 acagcaca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07B;
CIRA_Arth03 CIRA042_A gtgtacat:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07C;
CIRA_Arth03 CIRA043_A tatgtcag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07D;
CIRA_Arth03 CIRA044_A tagtcgca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07E;
CIRA_Arth03 CIRA045_A tactatac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07F;
CIRA_Arth03 CIRA046_A actagatc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07G;
CIRA_Arth03 CIRA047_A gatcgcga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07H;
CIRA_Arth03 BLNK008 acacacac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08A;
CIRA_Arth03 CIRA048_A acagcaca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08B;
CIRA_Arth03 CIRA049_A gtgtacat:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08C;
CIRA_Arth03 CIRA050_A tatgtcag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08D;
CIRA_Arth03 CIRA051_A tagtcgca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08E;
CIRA_Arth03 CPCR02_A tactatac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08F;
CIRA_Arth03 CIRA052_A actagatc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08G;
CIRA_Arth03 CIRA053_A gatcgcga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08H;
CIRA_Arth03 CIRA054_A acacacac:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09A;
CIRA_Arth03 CPOS241_A acagcaca:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09B;
CIRA_Arth03 CIRA001_B cgctctcg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01A;
CIRA_Arth03 CIRA002_B gtcgtaga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01B;
CIRA_Arth03 CIRA003_B gtcacgtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01C;
CIRA_Arth03 CIRA004_B gactgatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01D;
CIRA_Arth03 CIRA005_B agactatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01E;
CIRA_Arth03 CIRA006_B gcgtcagc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01F;
CIRA_Arth03 CIRA007_B tgacatca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01G;
CIRA_Arth03 BLNK009 acatgtgt:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01H;
CIRA_Arth03 CIRA008_B cgctctcg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02A;
CIRA_Arth03 CIRA009_B gtcgtaga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02B;
CIRA_Arth03 CIRA010_B gtcacgtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02C;
CIRA_Arth03 CPCR01_B gactgatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02D;
CIRA_Arth03 CIRA011_B agactatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02E;
CIRA_Arth03 CIRA012_B gcgtcagc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02F;
CIRA_Arth03 BLNK010 tgacatca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02G;
CIRA_Arth03 CIRA013_B acatgtgt:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02H;
CIRA_Arth03 CIRA014_B cgctctcg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03A;
CIRA_Arth03 CIRA015_B gtcgtaga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03B;
CIRA_Arth03 CIRA016_B gtcacgtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03C;
CIRA_Arth03 CIRA017_B gactgatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03D;
CIRA_Arth03 CIRA018_B agactatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03E;
CIRA_Arth03 BLNK011 gcgtcagc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03F;
CIRA_Arth03 CIRA019_B tgacatca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03G;
CIRA_Arth03 CIRA020_B acatgtgt:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03H;
CIRA_Arth03 CIRA021_B cgctctcg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04A;
CIRA_Arth03 CIRA022_B gtcgtaga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04B;
CIRA_Arth03 CIRA023_B gtcacgtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04C;
CIRA_Arth03 CIRA024_B gactgatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04D;
CIRA_Arth03 BLNK012 agactatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04E;
CIRA_Arth03 CIRA025_B gcgtcagc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04F;
CIRA_Arth03 CIRA026_B tgacatca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04G;
CIRA_Arth03 CIRA027_B acatgtgt:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04H;
CIRA_Arth03 CPOS232_B cgctctcg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05A;
CIRA_Arth03 CIRA028_B gtcgtaga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05B;
CIRA_Arth03 CIRA029_B gtcacgtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05C;
CIRA_Arth03 BLNK013 gactgatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05D;
CIRA_Arth03 CIRA030_B agactatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05E;
CIRA_Arth03 CIRA031_B gcgtcagc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05F;
CIRA_Arth03 CIRA032_B tgacatca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05G;
CIRA_Arth03 CIRA033_B acatgtgt:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05H;
CIRA_Arth03 CIRA034_B cgctctcg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06A;
CIRA_Arth03 CIRA035_B gtcgtaga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06B;
CIRA_Arth03 BLNK014 gtcacgtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06C;
CIRA_Arth03 CIRA036_B gactgatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06D;
CIRA_Arth03 CIRA037_B agactatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06E;
CIRA_Arth03 CIRA038_B gcgtcagc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06F;
CIRA_Arth03 CIRA039_B tgacatca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06G;
CIRA_Arth03 CIRA040_B acatgtgt:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06H;
CIRA_Arth03 CIRA041_B cgctctcg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07A;
CIRA_Arth03 BLNK015 gtcgtaga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07B;
CIRA_Arth03 CIRA042_B gtcacgtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07C;
CIRA_Arth03 CIRA043_B gactgatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07D;
CIRA_Arth03 CIRA044_B agactatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07E;
CIRA_Arth03 CIRA045_B gcgtcagc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07F;
CIRA_Arth03 CIRA046_B tgacatca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07G;
CIRA_Arth03 CIRA047_B acatgtgt:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07H;
CIRA_Arth03 BLNK016 cgctctcg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08A;
CIRA_Arth03 CIRA048_B gtcgtaga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08B;
CIRA_Arth03 CIRA049_B gtcacgtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08C;
CIRA_Arth03 CIRA050_B gactgatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08D;
CIRA_Arth03 CIRA051_B agactatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08E;
CIRA_Arth03 CPCR02_B gcgtcagc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08F;
CIRA_Arth03 CIRA052_B tgacatca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08G;
CIRA_Arth03 CIRA053_B acatgtgt:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08H;
CIRA_Arth03 CIRA054_B cgctctcg:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09A;
CIRA_Arth03 CPOS241_B gtcgtaga:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09B;
CIRA_Arth03 CIRA002_C atgatcgc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01B;
CIRA_Arth03 CIRA003_C acgacgag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01C;
CIRA_Arth03 CIRA004_C catcagtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01D;
CIRA_Arth03 CIRA005_C atcagtca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01E;
CIRA_Arth03 CIRA006_C tctactga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01F;
CIRA_Arth03 CIRA007_C gatgatct:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01G;
CIRA_Arth03 CIRA009_C atgatcgc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02B;
CIRA_Arth03 CIRA010_C acgacgag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02C;
CIRA_Arth03 CPCR01_C catcagtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02D;
CIRA_Arth03 CIRA011_C atcagtca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02E;
CIRA_Arth03 CIRA012_C tctactga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02F;
CIRA_Arth03 BLNK018 gatgatct:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02G;
CIRA_Arth03 CIRA015_C atgatcgc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03B;
CIRA_Arth03 CIRA016_C acgacgag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03C;
CIRA_Arth03 CIRA017_C catcagtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03D;
CIRA_Arth03 CIRA018_C atcagtca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03E;
CIRA_Arth03 BLNK019 tctactga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03F;
CIRA_Arth03 CIRA019_C gatgatct:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03G;
CIRA_Arth03 CIRA022_C atgatcgc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04B;
CIRA_Arth03 CIRA023_C acgacgag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04C;
CIRA_Arth03 CIRA024_C catcagtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04D;
CIRA_Arth03 BLNK020 atcagtca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04E;
CIRA_Arth03 CIRA025_C tctactga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04F;
CIRA_Arth03 CIRA026_C gatgatct:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04G;
CIRA_Arth03 CIRA028_C atgatcgc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05B;
CIRA_Arth03 CIRA029_C acgacgag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05C;
CIRA_Arth03 BLNK021 catcagtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05D;
CIRA_Arth03 CIRA030_C atcagtca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05E;
CIRA_Arth03 CIRA031_C tctactga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05F;
CIRA_Arth03 CIRA032_C gatgatct:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05G;
CIRA_Arth03 CIRA035_C atgatcgc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06B;
CIRA_Arth03 BLNK022 acgacgag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06C;
CIRA_Arth03 CIRA036_C catcagtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06D;
CIRA_Arth03 CIRA037_C atcagtca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06E;
CIRA_Arth03 CIRA038_C tctactga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06F;
CIRA_Arth03 CIRA039_C gatgatct:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06G;
CIRA_Arth03 BLNK023 atgatcgc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07B;
CIRA_Arth03 CIRA042_C acgacgag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07C;
CIRA_Arth03 CIRA043_C catcagtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07D;
CIRA_Arth03 CIRA044_C atcagtca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07E;
CIRA_Arth03 CIRA045_C tctactga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07F;
CIRA_Arth03 CIRA046_C gatgatct:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07G;
CIRA_Arth03 CIRA048_C atgatcgc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08B;
CIRA_Arth03 CIRA049_C acgacgag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08C;
CIRA_Arth03 CIRA050_C catcagtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08D;
CIRA_Arth03 CIRA051_C atcagtca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08E;
CIRA_Arth03 CPCR02_C tctactga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08F;
CIRA_Arth03 CIRA052_C gatgatct:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08G;
CIRA_Arth03 CPOS241_C atgatcgc:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_09B;
CIRA_Arth03 CIRA001_C acacacac:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01A;
CIRA_Arth03 BLNK017 gatcgcga:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01H;
CIRA_Arth03 CIRA008_C acacacac:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02A;
CIRA_Arth03 CIRA013_C gatcgcga:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02H;
CIRA_Arth03 CIRA014_C acacacac:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03A;
CIRA_Arth03 CIRA020_C gatcgcga:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03H;
CIRA_Arth03 CIRA021_C acacacac:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04A;
CIRA_Arth03 CIRA027_C gatcgcga:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04H;
CIRA_Arth03 CPOS232_C acacacac:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05A;
CIRA_Arth03 CIRA033_C gatcgcga:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05H;
CIRA_Arth03 CIRA034_C acacacac:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06A;
CIRA_Arth03 CIRA040_C gatcgcga:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06H;
CIRA_Arth03 CIRA041_C acacacac:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07A;
CIRA_Arth03 CIRA047_C gatcgcga:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07H;
CIRA_Arth03 BLNK024 acacacac:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08A;
CIRA_Arth03 CIRA053_C gatcgcga:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08H;
CIRA_Arth03 CIRA054_C acacacac:atcagtca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_09A;

409337
SPER01.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
412536492810152d7835808871ea2b0289a770f7

BIN
doc/.DS_Store vendored

Binary file not shown.

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

6
doc/book/Untitled.ipynb Normal file
View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,118 @@
#!/bin/bash
#OAR -n gbsort
##OAR --array 50
##OAR --array-param-file 50_first.tsv
#OAR --project phyloalps
#OAR -l nodes=1/core=10,walltime=24:00:00
#OAR -O gbsort.%jobid%.log
#OAR -E gbsort.%jobid%.log
#
# gbsort: annotate the GenBank flat files of a local mirror with the NCBI
# taxonomy and distribute the sequences into fasta files according to
# their "division" and "section" annotations.
#
# Usage: <this script> GBDIR
#   GBDIR  directory holding the Release-<N> sub-directories of the mirror
#
# The "#OAR" lines above look like directives for the OAR batch scheduler;
# lines starting with "##OAR" are disabled directives. Keep them grouped
# directly below the shebang.
#
# Paths noted by the author (not used by the code):
# /silenus/PROJECTS/pr-phyloalps/coissac
# /bettik/LECA/ENVIRONMENT/data/biodatabase/genbank

#
# Used resources URLs
#
NCBIURL="https://ftp.ncbi.nlm.nih.gov/"            # NCBI Web site URL
GBURL="${NCBIURL}genbank/"                         # Directory of Genbank flat files
TAXOURL="${NCBIURL}pub/taxonomy/taxdump.tar.gz"    # NCBI Taxdump
LOGFILE="download.log"

#
# List of downloaded Genbank divisions
#
# NOTE(review): GBURL, LOGFILE and DIV do not appear to be referenced by the
# rest of this script as shown; only TAXOURL is used. Confirm before removing.
DIV="bct|inv|mam|phg|pln|pri|rod|vrl|vrt"
############################
#
# Functions
#
############################
# pattern_at_rank TAXDIR RANK
#
# Prints an anchored alternation "^(id1|id2|...)$" built from the first
# column of TAXDIR/nodes.dmp for every line whose third "|"-separated
# column matches RANK surrounded by tabs.
# When no line matches, the printed pattern is "^()$".
pattern_at_rank() {
    local taxdir="$1"
    local wanted="$2"
    local ids

    # Join the trimmed identifiers with "|" directly in awk, so that no
    # trailing separator has to be removed afterwards.
    ids=$(awk -F "|" -v rank="$wanted" '
        BEGIN { rank = "\t" rank "\t"; sep = "" }
        $3 ~ rank {
            id = $1
            gsub(/^[ \t]+|[ \t]+$/, "", id)
            joined = joined sep id
            sep = "|"
        }
        END { printf "%s", joined }
    ' "${taxdir}/nodes.dmp") || true

    echo "^(${ids})$"
}
#
# The single argument is the directory of the local Genbank mirror
# (the one containing the Release-<N> sub-directories).
#
GBDIR="$1"

if [[ -z "${GBDIR}" ]] || [[ ! -d "${GBDIR}" ]]; then
    echo "Usage: $0 <genbank-mirror-directory>" >&2
    exit 1
fi

# The script changes directory below: make the path absolute so that
# GBSOURCE keeps pointing at the mirror afterwards.
GBDIR="$(cd "${GBDIR}" && pwd)" || exit 1

#
# Look in the local mirror for the most recent Genbank release number
# and create the corresponding output directory.
#
echo "Looking at current Genbank release number"

# Glob instead of parsing `ls`, and sort numerically so that release 99
# does not rank above release 251.
Detected_Release_Number=$(
    for r in "${GBDIR}/Release-"*; do
        [[ -e "$r" ]] || continue
        basename "$r"
    done \
    | sed 's/^Release-//' \
    | sort -rn \
    | head -1
)

echo "identified latest release number is : ${Detected_Release_Number}"

# The release was pinned to 251 in the original script, and that remains
# the default. Set GB_RELEASE=<N> to process another release, or
# GB_RELEASE= (set but empty) to use the detected latest release.
GB_Release_Number="${GB_RELEASE-251}"
GB_Release_Number="${GB_Release_Number:-${Detected_Release_Number}}"

if [[ -z "${GB_Release_Number}" ]]; then
    echo "No Genbank release found in ${GBDIR}" >&2
    exit 1
fi

echo "using release number : ${GB_Release_Number}"

GBSOURCE="${GBDIR}/Release-${GB_Release_Number}"

mkdir -p "Release-${GB_Release_Number}"
cd "Release-${GB_Release_Number}" || exit

#
# Download the current NCBI taxonomy
#
mkdir -p "ncbitaxo"

if [[ ! -f ncbitaxo/nodes.dmp ]] || [[ ! -f ncbitaxo/names.dmp ]] ; then
    # --fail keeps an HTTP error page from being piped into tar;
    # pipefail (subshell only) reports a failure of either command.
    if ! ( set -o pipefail
           curl --fail "${TAXOURL}" \
           | tar -C "ncbitaxo" -zxf - ) ; then
        echo "Cannot download the NCBI taxonomy from ${TAXOURL}" >&2
        exit 1
    fi
fi

#
# Annotate every flat file of the release and distribute its sequences
# into fasta files according to their division and section.
#
for f in "${GBSOURCE}/"*.seq.gz ; do
    # An unmatched glob is left as a literal pattern: skip it.
    [[ -e "$f" ]] || continue

    echo "PROCESSING : $f (results written under $(pwd))"

    obiannotate --genbank -t ncbitaxo \
                --with-taxon-at-rank kingdom \
                --with-taxon-at-rank superkingdom \
                --with-taxon-at-rank phylum \
                --with-taxon-at-rank order \
                --with-taxon-at-rank family \
                --with-taxon-at-rank genus \
                -S division='"misc-@-0"' \
                -S section='"misc-@-0"' \
                "$f" \
    | obigrep -A genus_taxid -A family_taxid \
    | obigrep -p 'annotations.genus_taxid > 0 && annotations.family_taxid > 0' \
              -p 'annotations.phylum_taxid > 0 || annotations.order_taxid > 0' \
    | obiannotate -p 'annotations.superkingdom_taxid > 0' \
                  -S division='printf("%s-S-%d",subspc(annotations.superkingdom_name),annotations.superkingdom_taxid)' \
    | obiannotate -p 'annotations.kingdom_taxid > 0' \
                  -S division='printf("%s-K-%d",subspc(annotations.kingdom_name),annotations.kingdom_taxid)' \
    | obiannotate -p 'annotations.phylum_taxid > 0' \
                  -S section='printf("%s-P-%d",subspc(annotations.phylum_name),annotations.phylum_taxid)' \
    | obiannotate -p 'annotations.order_taxid > 0' \
                  -S section='printf("%s-O-%d",subspc(annotations.order_name),annotations.order_taxid)' \
    | obidistribute -Z -A -p "%s.fasta" -c section -d division
done

31
obitools4/Dockerfile Normal file
View File

@ -0,0 +1,31 @@
# Build stage: installs the tool chain, clones org-asm and installs it
# into a Python virtual environment located at /org-assembler.
FROM ubuntu:lunar AS builder
LABEL dockerfile.version="1"
LABEL software="obitools4"
WORKDIR /
RUN apt update --fix-missing && apt upgrade -y
RUN apt install -y build-essential
RUN apt install -y git tcsh bash gawk parallel gettext zlib1g-dev libglib2.0-0
# RUN git clone https://git.metabarcoding.org/org-asm/org-annotate.git
# RUN cd org-annotate/src && make && cd ../..
# RUN cd /org-annotate/data/its/ITSx_db/HMMs && \
# rm *.h3* && \
# for f in *.hmm ; do /org-annotate/ports/i386-linux/bin/hmmpress $f ; done
RUN apt install -y python3-pip python3-dev python3-venv
RUN git clone https://git.metabarcoding.org/org-asm/org-asm.git
RUN python3 -m venv /org-assembler
# A `cd` inside a RUN instruction only lasts for that instruction: every RUN
# starts again from WORKDIR. Switch WORKDIR so that setup.py is found.
WORKDIR /org-asm
RUN /org-assembler/bin/pip3 install -r /org-asm/requirements.txt
RUN /org-assembler/bin/python3 setup.py install --no-serenity
WORKDIR /
# FROM ubuntu:lunar as phyloskims
# WORKDIR /
# RUN apt update --fix-missing && apt upgrade -y
# RUN apt install -y tcsh bash gawk parallel zlib1g libglib2.0-0
# COPY --from=builder /org-annotate /org-annotate
# RUN rm -rf /org-annotate/src
# COPY --from=builder /org-assembler /org-assembler
# RUN mkdir -p /data

View File

@ -0,0 +1,2 @@
package obiformats

View File

@ -3,93 +3,190 @@ package obistats
import (
"math"
"sync"
"time"
"golang.org/x/exp/rand"
"gonum.org/v1/gonum/stat/sampleuv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
//
// Parameters:
// - data: a 2D slice of float64 representing the data points to be assigned.
// - centers: a 2D slice of float64 representing the center points for each class.
//
// Return:
// - classes: a slice of int representing the assigned class for each data point.
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
classes := make([]int, len(*data))
numData := len(*data)
numCenters := len(*centers)
var wg sync.WaitGroup
wg.Add(numData)
for i := 0; i < numData; i++ {
go func(i int) {
defer wg.Done()
minDist := math.MaxFloat64
minDistIndex := -1
rowData := (*data)[i]
for j := 0; j < numCenters; j++ {
centerData := (*centers)[j]
dist := 0.0
for d, val := range rowData {
diff := val - centerData[d]
dist += diff * diff
func squareDist(a, b []float64) float64 {
sum := 0.0
for i := 0; i < len(a); i++ {
diff := a[i] - b[i]
sum += diff * diff
}
if dist < minDist {
minDist = dist
minDistIndex = j
}
}
classes[i] = minDistIndex
}(i)
}
wg.Wait()
return classes
return sum
}
// ComputeCenters calculates the centers of clusters for a given data set.
//
// Parameters:
// - data: a pointer to a matrix of float64 values representing the data set.
// - k: an integer representing the number of clusters.
// - classes: a slice of integers representing the assigned cluster for each data point.
//
// Returns:
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
// ComputeCenters calculates the centers of clusters for a given data set.
//
// Parameters:
// - data: a pointer to a matrix of float64 values representing the data set.
// - k: an integer representing the number of clusters.
// - classes: a slice of integers representing the assigned cluster for each data point.
//
// Returns:
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
centers := obiutils.Make2DNumericArray[float64](k, len((*data)[0]), true)
ns := make([]int, k)
// DefaultRG builds a new random generator whose seed is the current
// time expressed in nanoseconds.
func DefaultRG() *rand.Rand {
	seed := uint64(time.Now().UnixNano())
	source := rand.NewSource(seed)

	return rand.New(source)
}
var wg sync.WaitGroup
// KmeansClustering holds the state of a K-means clustering: the data
// set, the current centers and the current assignment of every data
// point to a center.
type KmeansClustering struct {
// data points to the matrix of data points, one point per row.
data *obiutils.Matrix[float64]
// rg is the random generator handed to the weighted sampling of centers.
rg *rand.Rand
// centers holds the coordinates of the current centers, one row per center.
centers obiutils.Matrix[float64]
// icenters holds, for each center, the index of the data row it was picked from.
icenters []int
// sizes holds the number of data points assigned to each center.
sizes []int
// distmin holds, for each data point, the smallest squared distance to a center.
distmin []float64
// classes holds, for each data point, the index of the center it is assigned to.
classes []int
}
for i := range ns {
ns[i] = 0
// MakeKmeansClustering creates a clustering of data and selects its k
// initial centers by calling AddACenter k times.
//
// Parameters:
//   - data: a pointer to the matrix of data points, one point per row.
//   - k: the number of centers to select.
//   - rg: the random generator used to select the centers. When rg is nil,
//     a generator created by DefaultRG is used, so that the weighted
//     sampling never receives a nil generator.
//
// Returns:
//   - a pointer to the initialized KmeansClustering. No assignment of the
//     points to classes has been computed yet.
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
	if rg == nil {
		rg = DefaultRG()
	}

	// Every point starts infinitely far from any center; AddACenter
	// lowers these distances as centers are added.
	distmin := make([]float64, len(*data))
	for i := range distmin {
		distmin[i] = math.MaxFloat64
	}

	clustering := &KmeansClustering{
		data:     data,
		icenters: make([]int, 0, k),
		sizes:    make([]int, 0, k),
		centers:  make(obiutils.Matrix[float64], 0, k),
		distmin:  distmin,
		classes:  make([]int, len(*data)),
		rg:       rg,
	}

	for i := 0; i < k; i++ {
		clustering.AddACenter()
	}

	return clustering
}
// K reports how many centers have been selected so far, i.e. the
// current number of clusters.
func (clustering *KmeansClustering) K() int {
	nCenters := len(clustering.icenters)

	return nCenters
}
// N reports the number of data points (rows of the data matrix)
// handled by the clustering.
func (clustering *KmeansClustering) N() int {
	points := *clustering.data

	return len(points)
}
// Dimension returns the number of coordinates of a data point, taken
// from the length of the first row of the data matrix.
//
// It returns 0 when the data matrix has no row, instead of panicking
// on the access to the first row.
func (clustering *KmeansClustering) Dimension() int {
	points := *clustering.data

	if len(points) == 0 {
		return 0
	}

	return len(points[0])
}
// AddACenter selects one more data point as a center and appends it to
// the clustering.
//
// The first center is drawn uniformly among the data points. Each
// following center is drawn with a probability proportional to the
// squared distance between the point and its closest center (distmin).
// When no point can be drawn that way (all weights are zero), the center
// is drawn uniformly instead of indexing the data with -1.
//
// After the selection, distmin is lowered for every point that is closer
// to the new center than to the previous ones.
func (clustering *KmeansClustering) AddACenter() {
	n := clustering.N()

	// Draw uniformly with the generator of the clustering, so that a
	// seeded generator gives reproducible centers. The global generator
	// is only used when the clustering has none.
	uniform := func() int {
		if clustering.rg != nil {
			return clustering.rg.Intn(n)
		}
		return rand.Intn(n)
	}

	C := 0
	if clustering.K() == 0 {
		C = uniform()
	} else {
		// A nil *rand.Rand stored in the interface would not compare
		// equal to nil: pass a true nil source in that case.
		var src rand.Source
		if clustering.rg != nil {
			src = clustering.rg
		}

		w := sampleuv.NewWeighted(clustering.distmin, src)

		var ok bool
		C, ok = w.Take()
		if !ok {
			C = uniform()
		}
	}

	clustering.icenters = append(clustering.icenters, C)
	clustering.sizes = append(clustering.sizes, 0)

	// Copy the row: the center must not share its storage with the data,
	// otherwise an in-place update of the center would alter the data.
	center := append([]float64(nil), (*clustering.data)[C]...)
	clustering.centers = append(clustering.centers, center)

	for i := 0; i < n; i++ {
		d := squareDist((*clustering.data)[i], center)
		if d < clustering.distmin[i] {
			clustering.distmin[i] = d
		}
	}
}
// ResetEmptyCenters replaces every center whose class is empty
// (sizes[i] == 0) by a data point drawn with a probability proportional
// to distmin.
//
// A single sampler is shared by all the replacements of one call, so the
// same data point cannot be drawn for two empty centers. When no point
// can be drawn any more (all remaining weights are zero), the remaining
// empty centers are left unchanged and are not counted.
//
// Returns the number of centers actually replaced.
func (clustering *KmeansClustering) ResetEmptyCenters() int {
	// A nil *rand.Rand stored in the interface would not compare equal
	// to nil: pass a true nil source in that case.
	var src rand.Source
	if clustering.rg != nil {
		src = clustering.rg
	}

	// The sampler is only built when an empty center is found.
	var take func() (int, bool)

	nreset := 0
	for i := 0; i < clustering.K(); i++ {
		if clustering.sizes[i] != 0 {
			continue
		}

		if take == nil {
			w := sampleuv.NewWeighted(clustering.distmin, src)
			take = w.Take
		}

		C, ok := take()
		if !ok {
			break
		}

		clustering.icenters[i] = C
		// Copy the row: the center must not share its storage with the data.
		clustering.centers[i] = append([]float64(nil), (*clustering.data)[C]...)
		nreset++
	}

	return nreset
}
// AssignToClass assigns every data point to its closest center.
//
// For each point, one goroutine computes the squared distance to every
// center and records the closest one in classes, the distance in distmin
// and increments the size of the chosen class. The method waits for all
// goroutines before looking at the class sizes.
//
// When some classes are empty, their centers are replaced by
// ResetEmptyCenters, a warning is logged and the assignment is run again.
func (clustering *KmeansClustering) AssignToClass() {
	var wg sync.WaitGroup
	var lock sync.Mutex

	for i := 0; i < clustering.K(); i++ {
		clustering.sizes[i] = 0
	}

	for i := 0; i < clustering.N(); i++ {
		clustering.distmin[i] = math.MaxFloat64
	}

	goroutine := func(i int) {
		defer wg.Done()

		dmin := math.MaxFloat64
		cmin := -1
		for j, center := range clustering.centers {
			dist := squareDist((*clustering.data)[i], center)
			if dist < dmin {
				dmin = dist
				cmin = j
			}
		}

		// sizes is shared by all goroutines: update under the lock.
		lock.Lock()
		clustering.classes[i] = cmin
		clustering.sizes[cmin]++
		clustering.distmin[i] = dmin
		lock.Unlock()
	}

	wg.Add(clustering.N())
	for i := 0; i < clustering.N(); i++ {
		go goroutine(i)
	}

	// The sizes and distmin read below are only complete once every
	// goroutine has finished.
	wg.Wait()

	nreset := clustering.ResetEmptyCenters()

	if nreset > 0 {
		log.Warnf("Reset %d empty centers", nreset)
		clustering.AssignToClass()
	}
}
// ComputeCenters calculates the centers of the K-means clustering algorithm.
//
// It takes no parameters.
// It does not return any values.
func (clustering *KmeansClustering) ComputeCenters() {
var wg sync.WaitGroup
centers := clustering.centers
data := clustering.data
classes := clustering.classes
k := clustering.K()
// Goroutine code
goroutine := func(centerIdx int) {
goroutine1 := func(centerIdx int) {
defer wg.Done()
for j, row := range *data {
class := classes[j]
if class == centerIdx {
ns[centerIdx]++
for l, val := range row {
centers[centerIdx][l] += val
}
@ -99,149 +196,168 @@ func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiut
for i := 0; i < k; i++ {
wg.Add(1)
go goroutine(i)
go goroutine1(i)
}
wg.Wait()
for i := range centers {
for j := range centers[i] {
centers[i][j] /= float64(ns[i])
centers[i][j] /= float64(clustering.sizes[i])
}
}
return &centers
}
// ComputeInertia computes the inertia of the given data and centers in parallel.
//
// Parameters:
// - data: A pointer to a Matrix of float64 representing the data.
// - classes: A slice of int representing the class labels for each data point.
// - centers: A pointer to a Matrix of float64 representing the centers.
//
// Return type:
// - float64: The computed inertia.
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
inertia := make(chan float64)
numRows := len(*data)
wg := sync.WaitGroup{}
wg.Add(numRows)
for i := 0; i < numRows; i++ {
go func(i int) {
goroutine2 := func(centerIdx int) {
defer wg.Done()
row := (*data)[i]
class := classes[i]
center := (*centers)[class]
inertiaLocal := 0.0
for j, val := range row {
diff := val - center[j]
inertiaLocal += diff * diff
}
inertia <- inertiaLocal
}(i)
}
go func() {
wg.Wait()
close(inertia)
}()
totalInertia := 0.0
for localInertia := range inertia {
totalInertia += localInertia
}
return totalInertia
}
// Kmeans performs the K-means clustering algorithm on the given data.
//
// If centers is not nil and points to a non-nil matrix, that matrix is used
// as the initial set of centers and the number of classes (k) is set to its
// number of rows. Otherwise, the number of classes is defined by the value of
// k and the initial centers are the data rows whose indices are returned by
// SampleIntWithoutReplacement(k, len(*data)).
//
// After a first assignment/update pass, at most 100 further passes are run;
// iteration stops as soon as the decrease of inertia between two passes is
// not greater than threshold.
//
// Parameters:
//   - data: A pointer to a Matrix[float64] that represents the input data.
//   - k: An integer that specifies the number of clusters to create.
//   - threshold: A float64 value that determines the convergence threshold.
//   - centers: A pointer to a Matrix[float64] that represents the initial
//     cluster centers. It may be nil, or point to a nil matrix, to request a
//     random initialisation.
//
// Returns:
//   - classes: A slice of integers that assigns each data point to a cluster.
//   - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
//   - inertia: A float64 value that represents the overall inertia of the clustering.
//   - converged: A boolean value indicating whether the algorithm converged
//     (last inertia decrease strictly lower than threshold).
func Kmeans(data *obiutils.Matrix[float64],
	k int,
	threshold float64,
	centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {

	// A nil pointer cannot be written through: give it a local (nil) matrix
	// so that the initialisation below works for both "nil pointer" and
	// "pointer to nil matrix" callers.
	if centers == nil {
		centers = new(obiutils.Matrix[float64])
	}

	if *centers == nil {
		*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
		center_ids := SampleIntWithoutReplacement(k, len(*data))
		for i, id := range center_ids {
			(*centers)[i] = (*data)[id]
		}
	} else {
		k = len(*centers)
	}

	classes := AssignToClass(data, centers)
	centers = ComputeCenters(data, k, classes)
	inertia := ComputeInertia(data, classes, centers)

	// Start with a delta larger than the threshold to enter the loop.
	delta := threshold * 100.0
	for i := 0; i < 100 && delta > threshold; i++ {
		classes = AssignToClass(data, centers)
		centers = ComputeCenters(data, k, classes)
		newi := ComputeInertia(data, classes, centers)
		delta = inertia - newi
		inertia = newi
		log.Debugf("Inertia: %f, delta: %f", inertia, delta)
	}

	return classes, centers, inertia, delta < threshold
}
// KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
//
// It takes a matrix of data points and a matrix of centers as input.
// The best representative is the data point that is closest to the center of the cluster.
// Returns an array of integers containing the index of the best representative for each cluster.
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
bestRepresentative := make([]int, len(*centers))
var wg sync.WaitGroup
wg.Add(len(*centers))
for j, center := range *centers {
go func(j int, center []float64) {
defer wg.Done()
bestDistToCenter := math.MaxFloat64
best := -1
for i, row := range *data {
dist := 0.0
for d, val := range row {
diff := val - center[d]
dist += diff * diff
}
if dist < bestDistToCenter {
bestDistToCenter = dist
best = i
dkmin := math.MaxFloat64
dki := -1
center := centers[centerIdx]
for j, row := range *data {
if classes[j] == centerIdx {
dist := squareDist(row, center)
if dist < dkmin {
dkmin = dist
dki = j
}
}
if best == -1 {
log.Fatalf("No representative found for cluster %d", j)
}
clustering.icenters[centerIdx] = dki
clustering.centers[centerIdx] = (*data)[dki]
}
bestRepresentative[j] = best
}(j, center)
for i := 0; i < k; i++ {
wg.Add(1)
go goroutine2(i)
}
wg.Wait()
return bestRepresentative
}
// Inertia returns the sum, over the N() data points, of the values stored in
// clustering.distmin (the distance recorded for each point by AssignToClass).
func (clustering *KmeansClustering) Inertia() float64 {
	var total float64

	for idx := 0; idx < clustering.N(); idx++ {
		total = total + clustering.distmin[idx]
	}

	return total
}
// Centers returns the current centers matrix of the clustering.
// The field is returned as is, without copying.
// NOTE(review): presumably the result shares its storage with the
// clustering (Matrix looks like a slice type) - confirm before mutating it.
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
return clustering.centers
}
// CentersIndices returns the icenters slice of the clustering, without
// copying it.
// NOTE(review): icenters presumably holds, for each class, the index of the
// data row used as its center - confirm against ComputeCenters.
func (clustering *KmeansClustering) CentersIndices() []int {
return clustering.icenters
}
// Sizes returns the per-class counters maintained by AssignToClass, i.e. the
// number of data points assigned to each class. The internal slice is
// returned directly, not a copy.
func (clustering *KmeansClustering) Sizes() []int {
return clustering.sizes
}
// Classes returns, for each data point, the index of the class it was
// assigned to by AssignToClass. The internal slice is returned directly,
// not a copy.
func (clustering *KmeansClustering) Classes() []int {
return clustering.classes
}
// Run repeats the AssignToClass / ComputeCenters cycle until the decrease of
// inertia between two successive cycles is no longer greater than threshold,
// or until max_cycle cycles have been executed.
//
// It returns true when the last measured decrease of inertia is lower than
// or equal to threshold.
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
	previous := math.MaxFloat64
	current := clustering.Inertia()

	cycle := 0
	for cycle < max_cycle {
		// Written as a negated ">" so that a NaN difference stops the loop.
		if !((previous - current) > threshold) {
			break
		}

		previous = current
		clustering.AssignToClass()
		clustering.ComputeCenters()
		current = clustering.Inertia()

		cycle++
	}

	converged := (previous - current) <= threshold
	return converged
}
// // Kmeans performs the K-means clustering algorithm on the given data.
// // if centers and *center is not nil, centers is considered as initialized
// // and the number of classes (k) is set to the number of rows in centers.
// // otherwise, the number of classes is defined by the value of k.
// // Parameters:
// // - data: A pointer to a Matrix[float64] that represents the input data.
// // - k: An integer that specifies the number of clusters to create.
// // - threshold: A float64 value that determines the convergence threshold.
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
// // Returns:
// // - classes: A slice of integers that assigns each data point to a cluster.
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
// // - inertia: A float64 value that represents the overall inertia of the clustering.
// // - converged: A boolean value indicating whether the algorithm converged.
// func Kmeans(data *obiutils.Matrix[float64],
// k int,
// threshold float64,
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
// if centers == nil || *centers == nil {
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
// center_ids := SampleIntWithoutReplacement(k, len(*data))
// for i, id := range center_ids {
// (*centers)[i] = (*data)[id]
// }
// } else {
// k = len(*centers)
// }
// classes := AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// inertia := ComputeInertia(data, classes, centers)
// delta := threshold * 100.0
// for i := 0; i < 100 && delta > threshold; i++ {
// classes = AssignToClass(data, centers)
// centers = ComputeCenters(data, k, classes)
// newi := ComputeInertia(data, classes, centers)
// delta = inertia - newi
// inertia = newi
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
// }
// return classes, centers, inertia, delta < threshold
// }
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
// //
// // It takes a matrix of data points and a matrix of centers as input.
// // The best representative is the data point that is closest to the center of the cluster.
// // Returns an array of integers containing the index of the best representative for each cluster.
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
// bestRepresentative := make([]int, len(*centers))
// var wg sync.WaitGroup
// wg.Add(len(*centers))
// for j, center := range *centers {
// go func(j int, center []float64) {
// defer wg.Done()
// bestDistToCenter := math.MaxFloat64
// best := -1
// for i, row := range *data {
// dist := 0.0
// for d, val := range row {
// diff := val - center[d]
// dist += diff * diff
// }
// if dist < bestDistToCenter {
// bestDistToCenter = dist
// best = i
// }
// }
// if best == -1 {
// log.Fatalf("No representative found for cluster %d", j)
// }
// bestRepresentative[j] = best
// }(j, center)
// }
// wg.Wait()
// return bestRepresentative
// }

64
pkg/obitable/table.go Normal file
View File

@ -0,0 +1,64 @@
// Package obitable provides a row-oriented data table structure.
package obitable
import (
"reflect"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/chen3feng/stl4go"
)
// Header is declared from stl4go.Ordered.
// NOTE(review): not used anywhere in the visible part of this file; its
// intended role (column header?) should be confirmed.
type Header stl4go.Ordered
// Row is a single table row: a map from a string key to an arbitrary value.
// NOTE(review): keys are presumably column names - confirm.
type Row map[string]interface{}
// Table is a row-oriented table: a slice of Row values plus a map giving a
// reflect.Type per string key.
// NOTE(review): ColType is presumably indexed by column name - confirm.
type Table struct {
ColType map[string]reflect.Type
Rows []Row
}
// RowGetter is a function returning the value associated with a name.
// The constructors RowFromMap and RowFromBioSeq build getters that fall back
// to a caller-supplied NA value when the name is unknown.
type RowGetter func(name string) interface{}
// RowFromMap builds a RowGetter backed by the map data.
//
// The returned getter looks name up in data and returns the stored value;
// when the key is absent it returns navalue instead.
func RowFromMap(data map[string]interface{}, navalue string) RowGetter {
	return func(name string) interface{} {
		if found, present := data[name]; present {
			return found
		}
		return navalue
	}
}
// RowFromBioSeq builds a RowGetter backed by the sequence data.
//
// The names "id", "sequence", "definition", "taxid" and "count" are served by
// the corresponding accessor of the sequence. Any other name is looked up
// with GetAttribute when the sequence reports annotations (HasAnnotation);
// if the sequence has no annotation, or the attribute is missing, the getter
// returns navalue.
func RowFromBioSeq(data *obiseq.BioSequence, navalue string) RowGetter {
	return func(name string) interface{} {
		// Built-in columns first.
		switch name {
		case "id":
			return data.Id()
		case "sequence":
			return data.Sequence()
		case "definition":
			return data.Definition()
		case "taxid":
			return data.Taxid()
		case "count":
			return data.Count()
		}

		// Everything else comes from the annotations, when there are any.
		if data.HasAnnotation() {
			if attribute, found := data.GetAttribute(name); found {
				return attribute
			}
		}

		return navalue
	}
}

View File

@ -124,6 +124,9 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
}
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
_, centers, inertia, converged := classifier.Run(1000, 0.001)
intertia := classifier.Inertia()
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
dist_centers := 0.0

BIN
sample/.DS_Store vendored

Binary file not shown.

11
sample/AY189646 Normal file
View File

@ -0,0 +1,11 @@
==> db_v05_idx_ori.fasta <==
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
cagcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.

91072
sample/Euka05_230327.xxx Normal file

File diff suppressed because it is too large Load Diff

91072
sample/Euka05_230327_frg.xxx Normal file

File diff suppressed because it is too large Load Diff

11
sample/FJ465692 Normal file
View File

@ -0,0 +1,11 @@
==> db_v05_idx_ori.fasta <==
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
cagcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
agcttaaaactcaaaggacctggcggtgcttcatatccct
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.

BIN
sample/STD_PLN_1.dat.gz Normal file

Binary file not shown.

BIN
sample/heap.out Normal file

Binary file not shown.

0
sample/heap.out2 Normal file
View File

15841
sample/xx.header Normal file

File diff suppressed because it is too large Load Diff

15841
sample/xx.sort.header Normal file

File diff suppressed because it is too large Load Diff

5805
sample/yy.header Normal file

File diff suppressed because it is too large Load Diff

5805
sample/yy.sort.header Normal file

File diff suppressed because it is too large Load Diff

537
test.go Normal file
View File

@ -0,0 +1,537 @@
package main
import (
"bytes"
"fmt"
"io"
"log"
"regexp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/gabriel-vasile/mimetype"
"golang.org/x/exp/slices"
)
// OBIMimeTypeGuesser guesses the MIME type of the data available on stream.
//
// The text/plain detector of the mimetype library is extended with detectors
// for fasta ("text/fasta"), fastq ("text/fastq"), ecoPCR v2 ("text/ecopcr2"),
// GenBank ("text/genbank") and EMBL ("text/embl") files. A single Read of at
// most 128 KiB is then issued on stream and the detection is run on the bytes
// actually read.
//
// Returns:
//   - the detected MIME type,
//   - a reader that yields the bytes consumed for the detection followed by
//     the remainder of stream, so that no data is lost for the caller,
//   - an error if the read failed (io.EOF is not considered as an error) or
//     if no MIME type could be produced.
//
// NOTE(review): the detectors are registered again on each call; consider
// moving the Extend calls to a one-time initialisation.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
	fastaDetector := func(raw []byte, limit uint32) bool {
		ok, err := regexp.Match("^>[^ ]", raw)
		return ok && err == nil
	}

	fastqDetector := func(raw []byte, limit uint32) bool {
		ok, err := regexp.Match("^@[^ ]", raw)
		return ok && err == nil
	}

	ecoPCR2Detector := func(raw []byte, limit uint32) bool {
		ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
		return ok
	}

	genbankDetector := func(raw []byte, limit uint32) bool {
		ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
		ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
		return ok2 || (ok1 && err == nil)
	}

	emblDetector := func(raw []byte, limit uint32) bool {
		ok := bytes.HasPrefix(raw, []byte("ID "))
		return ok
	}

	mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
	mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
	mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
	mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
	mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")

	// Create a buffer to store the read data
	buf := make([]byte, 1024*128)
	n, err := stream.Read(buf)

	if err != nil && err != io.EOF {
		return nil, nil, err
	}

	// Only the n bytes actually read are meaningful: the rest of buf is zero
	// padding that would make short text inputs look like binary data.
	head := buf[:n]

	// Detect the MIME type using the mimetype library
	mimeType := mimetype.Detect(head)
	if mimeType == nil {
		// err may be nil (or io.EOF) here: always report an explicit error
		// rather than returning three nil values.
		return nil, nil, fmt.Errorf("cannot detect the MIME type of the stream")
	}

	// Create a new reader based on the read data
	newReader := io.MultiReader(bytes.NewReader(head), stream)

	return mimeType, newReader, nil
}
var xxx1 = `00422_612GNAAXX:7:73:6614:3284#0/1
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
gaagtagtagaacaggctcctctagaagggt`
var xxx2 = `>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
gaagtagtagaacaggctcctctagaagggt`
var xxx3 = `00422_612GNAAXX:7:73:6614:3284#0/1
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg`
var yyy1 = `@HELIUM_000100422_612GNAAXX:7:1:9007:3289#0/1 {"demultiplex_error":"cannot assign the sequence to a sample"}
ccatctctcttagataccccactatgcttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaatagcttaaaactcaaagaactc
+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCACCCCCCCCCBCCACC779?############################################
@HELIUM_000100422_612GNAAXX:7:1:8849:9880#0/1 {"demultiplex_error":"cannot match any primer pair"}
gatcggaagagcggttcagcaggaatgccgagaccgatatcgtatgccgtcttctgcttgaaaaaaaaaacaaaataggagagtagactcactgccagtggtcgtcag
`
// LastFastqCut looks, scanning buffer backwards from its end, for a position
// where the buffer can be split just before the '@' starting a fastq record.
//
// The scan is a small state machine run from the last byte to the first:
//   - 0: search for a '+' character;
//   - 1: the byte preceding that '+' must be an end of line;
//   - 2: skip separators, then expect a sequence character;
//   - 3: consume sequence characters up to the preceding end of line;
//   - 4: skip end of line characters, the first other byte enters the header;
//   - 5: walk back along the header line until a '@' is met (cut candidate);
//   - 6: the byte preceding that '@' must be an end of line;
//   - 7: success.
//
// Only lowercase letters, '-' and '.' are accepted as sequence characters.
// Whenever a step fails, the scan restarts just before the last '+' that was
// considered (i is rewound to restart, then decremented by the loop).
// Because state 6 needs a byte before the '@', a record starting at offset 0
// of buffer is never selected as the cut point.
//
// Returns:
//   - on success: buffer[:cut] and a copy (bytes.Clone) of buffer[cut:],
//     where cut is the offset of the '@' found;
//   - otherwise: an empty slice and buffer itself.
func LastFastqCut(buffer []byte) ([]byte, []byte) {
imax := len(buffer)
cut := imax
state := 0
restart := imax - 1
for i := restart; i >= 0 && state < 7; i-- {
C := buffer[i]
is_end_of_line := C == '\r' || C == '\n'
is_space := C == ' ' || C == '\t'
is_sep := is_space || is_end_of_line
switch state {
case 0:
if C == '+' {
// Potential start of quality part step 1
state = 1
restart = i
}
case 1:
if is_end_of_line {
// Potential start of quality part step 2
state = 2
} else {
// it was not the start of quality part
state = 0
i = restart
}
case 2:
if is_sep {
// Potential start of quality part step 2 (stay in the same state)
state = 2
} else if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
// End of the sequence
state = 3
} else {
// it was not the start of quality part
state = 0
i = restart
}
case 3:
if is_end_of_line {
// Entering in the header line
state = 4
} else if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
// progressing along the sequence
state = 3
} else {
// it was not the sequence part
state = 0
i = restart
}
case 4:
// Skip the end of line characters separating header and sequence
if is_end_of_line {
state = 4
} else {
state = 5
}
case 5:
if is_end_of_line {
// It was not the header line
state = 0
i = restart
} else if C == '@' {
// Candidate start of record: remember where to cut
state = 6
cut = i
}
case 6:
// The '@' must be the first character of its line
if is_end_of_line {
state = 7
} else {
state = 0
i = restart
}
}
}
if state == 7 {
return buffer[:cut], bytes.Clone(buffer[cut:])
}
return []byte{}, buffer
}
// LastSequenceCut splits buffer just before the last '>' that directly
// follows an end of line character, scanning from the end of the buffer.
//
// Returns:
//   - when such a '>' is found at offset p: buffer[:p] and a copy
//     (bytes.Clone) of buffer[p:];
//   - otherwise: an empty slice and buffer itself.
//
// When the byte preceding a candidate '>' is not an end of line, that byte is
// consumed by the scan and is not itself considered as a candidate.
func LastSequenceCut(buffer []byte) ([]byte, []byte) {
	candidate := false // a '>' has just been seen at cutAt
	cutAt := 0

	for pos := len(buffer) - 1; pos >= 0; pos-- {
		c := buffer[pos]

		switch {
		case !candidate && c == '>':
			candidate = true
			cutAt = pos
		case candidate && (c == '\r' || c == '\n'):
			// The '>' starts a line: this is the cut point.
			return buffer[:cutAt], bytes.Clone(buffer[cutAt:])
		default:
			candidate = false
		}
	}

	return []byte{}, buffer
}
// FirstSequenceCut splits buffer just before the first '>' located either at
// offset 0 or right after an end of line character.
//
// Returns:
//   - when such a '>' is found at offset p: a copy (bytes.Clone) of
//     buffer[:p] and buffer[p:];
//   - otherwise: buffer itself and an empty slice.
func FirstSequenceCut(buffer []byte) ([]byte, []byte) {
	// True at offset 0 and right after any '\r' or '\n'.
	lineStart := true

	for pos, c := range buffer {
		switch {
		case c == '\r' || c == '\n':
			lineStart = true
		case lineStart && c == '>':
			return bytes.Clone(buffer[:pos]), buffer[pos:]
		default:
			lineStart = false
		}
	}

	return buffer, []byte{}
}
// FullSequenceCut splits buffer in three parts using FirstSequenceCut and
// then LastSequenceCut:
//   - what precedes the first record found by FirstSequenceCut,
//   - the part kept before the cut point of LastSequenceCut,
//   - what follows that cut point.
//
// When FirstSequenceCut leaves nothing after the head, the last two parts
// are empty slices.
func FullSequenceCut(buffer []byte) ([]byte, []byte, []byte) {
	head, remaining := FirstSequenceCut(buffer)

	if len(remaining) == 0 {
		return head, []byte{}, []byte{}
	}

	body, tail := LastSequenceCut(remaining)

	return head, body, tail
}
// Concatenate returns the elements of s1 followed by those of s2.
//
// When one of the slices is empty the other one is returned unchanged (s2 if
// s1 is empty, s1 if only s2 is empty). When both hold elements the result is
// built by appending to s1 with its capacity clamped to its length, so the
// spare capacity of s1 is never overwritten.
func Concatenate[S ~[]E, E any](s1, s2 S) S {
	switch {
	case len(s1) == 0:
		return s2
	case len(s2) == 0:
		return s1
	default:
		clamped := s1[:len(s1):len(s1)]
		return append(clamped, s2...)
	}
}
// FastxChunk is a piece of raw sequence file data produced by
// FastaChunkReader.
type FastxChunk struct {
// Bytes is the raw content of the chunk.
Bytes []byte
// index is the rank of the chunk in the order it was emitted (starts at 0).
index int
}
// FastaChunkReader reads r by blocks of size bytes and sends them on the
// returned channel as FastxChunk values, each chunk being cut with
// LastSequenceCut so that it ends just before the start of a record. The
// bytes following the cut are prepended to the next block; what is left
// after the last read is sent as a final chunk.
//
// Parameters:
//   - r: the data source.
//   - size: the number of bytes requested at each read.
//   - cutHead: when false, any data found by FirstSequenceCut before the
//     first record of the first block is reported as an error; when true
//     that data is silently discarded.
//
// Returns the channel of chunks and an error. The channel is always closed,
// either immediately (nothing to read, or error) or by the producing
// goroutine once the whole input has been consumed, so ranging over it
// always terminates.
//
// NOTE(review): a read error other than io.EOF occurring after the first
// block only stops the production; it cannot be reported through the
// current interface.
func FastaChunkReader(r io.Reader, size int, cutHead bool) (chan FastxChunk, error) {
	out := make(chan FastxChunk)
	buff := make([]byte, size)

	n, err := r.Read(buff)

	if n <= 0 {
		// Nothing will ever be produced: close the channel so that
		// consumers ranging over it do not block forever.
		close(out)
		if err != nil && err != io.EOF {
			return out, err
		}
		return out, nil
	}

	buff = buff[:n]

	begin, buff := FirstSequenceCut(buff)

	if len(begin) > 0 && !cutHead {
		close(out)
		return out, fmt.Errorf("begin is not empty : %s", string(begin))
	}

	go func(buff []byte) {
		defer close(out)

		idx := 0
		end := []byte{}

		for n > 0 {
			// Paste the tail of the previous block in front of this one,
			// then keep only complete records in buff.
			buff = Concatenate(end, buff)
			buff, end = LastSequenceCut(buff)

			if len(buff) > 0 {
				out <- FastxChunk{
					Bytes: bytes.Clone(buff),
					index: idx,
				}
				idx++
			}

			// A reader may return data together with io.EOF (or another
			// error): that data has just been processed, now stop.
			if err != nil {
				break
			}

			buff = slices.Grow(buff[:0], size)[0:size]
			n, err = r.Read(buff)
			buff = buff[:n]
		}

		if len(end) > 0 {
			out <- FastxChunk{
				Bytes: bytes.Clone(end),
				index: idx,
			}
		}
	}(buff)

	return out, nil
}
// ParseFastaChunk parses the fasta records contained in ch.Bytes and returns
// them as a batch whose order is ch.index.
//
// The parser is a state machine run over the bytes of the chunk:
//   - 0: search for the '>' starting a record;
//   - 1: first character of the identifier;
//   - 2: inside the identifier;
//   - 3: blanks between identifier and definition;
//   - 4: inside the definition;
//   - 5: end of line characters preceding the sequence;
//   - 6: inside the sequence, up to the next '>'.
//
// Sequence characters are converted to lower case; only letters, '-' and '.'
// are kept, every other character (white space included) is dropped. The
// sequence is compacted in place, so ch.Bytes is modified by the call.
//
// It returns nil when a '>' is directly followed by a separator (record
// without identifier). A record whose header is not terminated by an end of
// line at the end of the chunk is ignored.
func ParseFastaChunk(ch FastxChunk) *obiiter.BioSequenceBatch {
	slice := make(obiseq.BioSequenceSlice, 0, obioptions.CLIBatchSize())

	state := 0
	start := 0
	current := 0
	var identifier string
	var definition string

	for i := 0; i < len(ch.Bytes); i++ {
		C := ch.Bytes[i]
		is_end_of_line := C == '\r' || C == '\n'
		is_space := C == ' ' || C == '\t'
		is_sep := is_space || is_end_of_line

		switch state {
		case 0:
			if C == '>' {
				// Beginning of sequence
				state = 1
			}
		case 1:
			if is_sep {
				// No identifier -> ERROR
				return nil
			}
			// Beginning of identifier
			state = 2
			start = i
		case 2:
			if is_sep {
				// End of identifier
				identifier = string(ch.Bytes[start:i])
				if is_end_of_line {
					// The header line stops right after the identifier:
					// there is no definition, the next line is sequence.
					definition = ""
					state = 5
				} else {
					state = 3
				}
			}
		case 3:
			if is_end_of_line {
				// Definition empty
				definition = ""
				state = 5
			} else if !is_space {
				// Beginning of definition
				start = i
				state = 4
			}
		case 4:
			if is_end_of_line {
				definition = string(ch.Bytes[start:i])
				state = 5
			}
		case 5:
			if is_end_of_line {
				break
			}
			// Beginning of sequence: the current character belongs to it
			// and must go through the same treatment as the following ones.
			start = i
			current = i
			state = 6
			fallthrough
		case 6:
			if C == '>' {
				// End of sequence
				s := obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition)
				slice = append(slice, s)
				state = 1
			} else if !is_sep {
				if C >= 'A' && C <= 'Z' {
					C = C + 'a' - 'A'
				}
				// Removing white space from the sequence
				if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
					ch.Bytes[current] = C
					current++
				}
			}
		}
	}

	// Flush the record in progress, if any. In the other states either no
	// record was started or its header is incomplete: start and current are
	// then meaningless and nothing must be emitted.
	switch state {
	case 6:
		slice = append(slice, obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition))
	case 5:
		// Complete header without any sequence line.
		slice = append(slice, obiseq.NewBioSequence(identifier, []byte{}, definition))
	}

	batch := obiiter.MakeBioSequenceBatch(ch.index, slice)
	return &batch
}
// ReadFastaSequence reads fasta data from reader and returns an iterator over
// the parsed sequences.
//
// The input is split in chunks by FastaChunkReader (blocks of 1024*500
// bytes, leading data before the first record being an error) and the chunks
// are parsed by obioptions.CLIReadParallelWorkers() goroutines running
// ParseFastaChunk. The resulting batches are sorted and rebatched to
// obioptions.CLIBatchSize() before being returned.
//
// The function panics (log.Panicln) when FastaChunkReader returns an error.
func ReadFastaSequence(reader io.Reader) obiiter.IBioSequence {
	iterator := obiiter.MakeIBioSequence()

	workers := obioptions.CLIReadParallelWorkers()
	iterator.Add(workers)

	chunks, err := FastaChunkReader(reader, 1024*500, false)
	if err != nil {
		log.Panicln("Error:", err)
	}

	// Close the iterator once every worker is done.
	go func() {
		iterator.WaitAndClose()
	}()

	for w := 0; w < workers; w++ {
		go func() {
			defer iterator.Done()
			for chunk := range chunks {
				if batch := ParseFastaChunk(chunk); batch != nil {
					iterator.Push(*batch)
				}
			}
		}()
	}

	return iterator.SortBatches().Rebatch(obioptions.CLIBatchSize())
}
// main is a scratch entry point used to try the cutting functions of this
// file. In its current state it only runs LastFastqCut on the yyy1 sample
// and prints both parts of the result; the commented-out sections are
// previous experiments (MIME type detection, chunk reading and parsing,
// LastSequenceCut and FullSequenceCut on the xxx samples).
func main() {
// if len(os.Args) != 2 {
// fmt.Println("Usage: go run main.go <filename>")
// return
// }
// filename := os.Args[1]
// filename := "100.fasta"
// file, err := os.Open(filename)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// defer file.Close()
// mimeType, input, err := OBIMimeTypeGuesser(file)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// fmt.Println("Detected MIME Type:", mimeType.String())
// ch, err := FastaChunkReader(input, 1024, false)
// if err != nil {
// fmt.Println("Error:", err)
// return
// }
// for chk := range ch {
// fmt.Printf("--------------------\n")
// b := ParseFastaChunk(chk)
// fmt.Printf("-------- %d --------\n", b.Order())
// for _, b := range b.Slice() {
// fmt.Printf("--%s--\t--%s--\t--%s--\n", b.Id(), b.Definition(), b.String())
// }
// }
// Split the fastq sample before its last complete record start.
d1, f1 := LastFastqCut([]byte(yyy1))
// d2, f2 := LastSequenceCut([]byte(xxx2))
// d3, f3 := LastSequenceCut([]byte(xxx3))
fmt.Println("Last Sequence Cut 1:", string(d1), "---", string(f1))
// fmt.Println("Last Sequence Cut 2:", string(d2), "---", string(f2))
// fmt.Println("Last Sequence Cut 3:", string(d3), "---", string(f3))
// d1, b1, f1 := FullSequenceCut([]byte(xxx1))
// d2, b2, f2 := FullSequenceCut([]byte(xxx2))
// d3, b3, f3 := FullSequenceCut([]byte(xxx3))
// fmt.Println("Last Sequence Cut:", string(d1), "---", string(b1), "---", string(f1))
// fmt.Println("Last Sequence Cut:", string(d2), "---", string(b2), "---", string(f2))
// fmt.Println("Last Sequence Cut:", string(d3), "---", string(b3), "---", string(f3))
// Now you can use "extractedData" to access the read data with the associated MIME type.
// For example, you can copy the data into a buffer for further manipulation.
}

View File

@ -0,0 +1 @@
061a528427b8ecb0b30df8e7923edc4220443ade

45277
toto.csv Normal file

File diff suppressed because it is too large Load Diff

BIN
toto.csv.gz Normal file

Binary file not shown.

9322
toto.json Normal file

File diff suppressed because it is too large Load Diff

BIN
wolf_R1.fasta.gz Normal file

Binary file not shown.

BIN
wolf_R2.fasta.gz Normal file

Binary file not shown.

6
wolf_count.csv Normal file
View File

@ -0,0 +1,6 @@
id,HELIUM_000100422_612GNAAXX:7:100:4828:3492#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:113:17236:15166#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:16:12111:9453#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:17:3675:13316#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:20:15729:20493#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:22:2603:18023#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:25:11714:14251#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:2:15508:17530#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:30:17945:19531#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:14122:13731#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:17680:16952#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:52:12776:11698#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:53:17880:8617#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:58:11419:17203#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:5:15939:5437#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:66:4039:8016#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:69:15276:10367#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:72:17638:8081#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:18108:9040#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:2880:4021#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:80:10626:19388#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:81:18704:12346#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:84:16335:5083#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:94:16908:11285#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:9:9623:15395#0/1_sub[28..127]
15a_F730814,0,0,0,0,0,9165,0,0,0,0,0,0,0,5,0,0,0,0,4,0,0,0,0,0,0,0
13a_F730603,22,0,1,0,0,0,0,0,0,0,15,0,0,19,0,25,0,0,9,0,20,0,0,0,8409,0
29a_F260619,0,0,13,0,16,6139,0,0,0,0,0,0,0,1,0,0,25,0,0,44,0,0,391,110,0,353
26a_F040644,0,72,0,17,0,0,14,18,43,31,0,52,88,481,12830,0,0,15,0,0,0,208,0,14,0,0
1 id HELIUM_000100422_612GNAAXX:7:100:4828:3492#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:113:17236:15166#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:16:12111:9453#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:17:3675:13316#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:20:15729:20493#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:22:2603:18023#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:25:11714:14251#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:2:15508:17530#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:30:17945:19531#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:34:14122:13731#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:34:17680:16952#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:52:12776:11698#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:53:17880:8617#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:58:11419:17203#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:5:15939:5437#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:66:4039:8016#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:69:15276:10367#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:72:17638:8081#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:7:18108:9040#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:7:2880:4021#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:80:10626:19388#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:81:18704:12346#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:84:16335:5083#0/1_sub[28..126] HELIUM_000100422_612GNAAXX:7:94:16908:11285#0/1_sub[28..127] HELIUM_000100422_612GNAAXX:7:9:9623:15395#0/1_sub[28..127]
2 15a_F730814 0 0 0 0 0 9165 0 0 0 0 0 0 0 5 0 0 0 0 4 0 0 0 0 0 0 0
3 13a_F730603 22 0 1 0 0 0 0 0 0 0 15 0 0 19 0 25 0 0 9 0 20 0 0 0 8409 0
4 29a_F260619 0 0 13 0 16 6139 0 0 0 0 0 0 0 1 0 0 25 0 0 44 0 0 391 110 0 353
5 26a_F040644 0 72 0 17 0 0 14 18 43 31 0 52 88 481 12830 0 0 15 0 0 0 208 0 14 0 0

4
wolf_diet_ngsfilter.txt Executable file
View File

@ -0,0 +1,4 @@
wolf_diet 13a_F730603 aattaac TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 13b_F730603 gaagtag TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 26a_F040644 gaatatc TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
wolf_diet 29a_F260619 gcctcct TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @

181104
xx Normal file

File diff suppressed because it is too large Load Diff

1
xxx.gb.REMOVED.git-id Normal file
View File

@ -0,0 +1 @@
b92f63b34879105e61db0faf868f2828b4db298d

77043
yyy.json Normal file

File diff suppressed because it is too large Load Diff

1860
yyy_LCS.csv Normal file

File diff suppressed because it is too large Load Diff

1104
yyy_geom.csv Normal file

File diff suppressed because it is too large Load Diff